# Logistic Regression
## 1. Spam filtering

In [74]:
import numpy as np
import pandas as pd
df = pd.read_csv('data/SMSSpamCollection/SMSSpamCollection', delimiter='\t', header=None)
print(df.head())
print ('Number of spam messages:', df[df[0] == 'spam'][0].count())
print ('Number of ham messages:', df[df[0] == 'ham'][0].count())

      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
Number of spam messages: 747
Number of ham messages: 4825


In [75]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
import sklearn.datasets as datasets
import matplotlib.pyplot as plt
from sklearn import metrics 
from sklearn.metrics import classification_report

In [76]:
df = pd.read_csv('data/SMSSpamCollection/SMSSpamCollection', delimiter='\t', header=None)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(df[1], df[0])

In [77]:
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)

In [78]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)
for prediction in predictions[:5]:
    print ('Prediction: %s.' % prediction)
    
print (X_test_raw[:5])

Prediction: ham.
Prediction: ham.
Prediction: spam.
Prediction: ham.
Prediction: ham.
2671    Yes. They replied my mail. I'm going to the ma...
4       Nah I don't think he goes to usf, he lives aro...
1007    Panasonic & BluetoothHdset FREE. Nokia FREE. M...
2961                     Sir send to group mail check it.
5362    I'm in inside office..still filling forms.don ...
Name: 1, dtype: object


### Evaluate the prediction

In [79]:
from sklearn.metrics import accuracy_score
y_pred, y_true = [0, 1, 1, 0], [1, 1, 1, 1]
print ('Accuracy:', accuracy_score(y_true, y_pred))

Accuracy: 0.5


## Logistic Regression from Scratch

In [80]:
diabetes = datasets.load_diabetes()
diabetes["feature_names"]

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [81]:
X=diabetes.data[:, np.newaxis, 2]
y=diabetes["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
#print(X_train.shape,y_train.shape)

N_train=y_train.shape[0]
N_test=y_test.shape[0]


In [82]:
def sigmoid(x): 
    return 1 / (1 + np.exp(-x))

In [83]:
def single_logic_likelihood(xi,yi):
    # calculates the logic likelihood for the ith x and y
    predicted_value=sigmoid(xi)
    
    return yi*np.log(predicted_value)+(1-yi)*np.log(1-predicted_value)
       

In [84]:

def logic_likelihood(Xs,ys):
    # calculates the su of logic likelihood
    sum = 0
    size=len(Xs)
    for _ in range (size):
        x=Xs[_]
        y=ys[_]
        sum+=single_logic_likelihood(x,y)
    
    return sum
 

In [85]:
def logistic_regression(Xs, ys, learning_rate, iteration):
    for _ in range (iteration):
        predictions=sigmoid(Xs)
        output_error=ys-predictions
        gradient=np.dot(Xs.T, output_error)
        if _ % 10000 == 0:
            print (logic_likelihood(Xs, ys))                

In [86]:
weights = logistic_regression(X_train, y_train, 0.001,3000)

[473.35238556]


In [87]:
def plot_training_data():
    x = np.linspace(-0.1, 0.18, 500)
    plt.plot(X_train, y_train, 'ro')
    plt.plot(x,m*x+b);
    plt.show()

In [88]:
def plot_test_data():
    x = np.linspace(-0.1, 0.18, 500)
    plt.plot(X_test, y_test, 'ro')
    plt.plot(x,m*x+b);
    plt.show()