<a href="https://colab.research.google.com/github/muskan9887/Hazelnuts_logistic_regression1/blob/main/hazelnuts_logistic_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
from math import ceil
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, auc
from sklearn.metrics import accuracy_score,roc_auc_score,roc_curve,auc

In [4]:
def sigmoid(z):
  return 1.0 / (1 + np.exp(-z))



In [5]:
#Loading Data
hazel_df = pd.read_csv("/content/hazelnuts.txt",sep="\t",header=None)
hazel_df = hazel_df.transpose()
hazel_df.columns = ["sample_id","length","width","thickness","surface_area","mass","compactness",
"hardness","shell_top_radius","water_content","carbohydrate_content","variety"]


In [8]:
hazel_df.head()

Unnamed: 0,sample_id,length,width,thickness,surface_area,mass,compactness,hardness,shell_top_radius,water_content,carbohydrate_content,variety
0,71,11.67,12.8025,8.055074738,34.65,1375.5,0.93005,19.145,4.4604,0.048667685,0.175,c_avellana
1,72,13.86,13.0995,7.349906812,38.1,1439.55,0.93401,8.78,4.7844,0.048826089,0.167,c_avellana
2,1,20.53,15.5925,9.565426582,49.89,1623.3,0.96217,5.12,5.2893,0.049521325,0.174,c_americana
3,73,14.13,12.222,7.182948645,35.43,1412.25,0.90178,13.694,4.8168,0.049594663,0.167,c_avellana
4,2,15.85,14.724,8.622661318,43.29,1512.0,0.96261,10.925,4.6296,0.050384315,0.173,c_americana


In [9]:
#Feature Selection
all_features = hazel_df.drop(["variety","sample_id"],axis=1)
target_feature = hazel_df["variety"]
all_features.head()

Unnamed: 0,length,width,thickness,surface_area,mass,compactness,hardness,shell_top_radius,water_content,carbohydrate_content
0,11.67,12.8025,8.055074738,34.65,1375.5,0.93005,19.145,4.4604,0.048667685,0.175
1,13.86,13.0995,7.349906812,38.1,1439.55,0.93401,8.78,4.7844,0.048826089,0.167
2,20.53,15.5925,9.565426582,49.89,1623.3,0.96217,5.12,5.2893,0.049521325,0.174
3,14.13,12.222,7.182948645,35.43,1412.25,0.90178,13.694,4.8168,0.049594663,0.167
4,15.85,14.724,8.622661318,43.29,1512.0,0.96261,10.925,4.6296,0.050384315,0.173


In [10]:
#Normalizing values
from sklearn import preprocessing
x = all_features.values.astype(float) #returns a numpy array of type float
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
scaled_features = pd.DataFrame(x_scaled)
scaled_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.150901,0.153243,0.324641,0.090652,0.142562,0.339383,0.773629,0.215165,0.0,1.0
1,0.274212,0.200285,0.151451,0.199245,0.268595,0.372051,0.324396,0.392418,0.003034,0.111111
2,0.649775,0.595153,0.695584,0.570349,0.630165,0.604356,0.165767,0.668636,0.016351,0.888889
3,0.289414,0.061297,0.110446,0.115203,0.214876,0.106171,0.537375,0.410143,0.017756,0.111111
4,0.386261,0.457591,0.46404,0.362606,0.411157,0.607985,0.417363,0.30773,0.032881,0.777778


In [12]:
#Preprocessing dataset for algorithm
Y=list()
X=np.array(scaled_features)
y=np.array(target_feature)
label_dict = {'c_avellana':0, 'c_americana':1, 'c_cornuta':2}
_label_dict = {0 :'c_avellana' , 1 :'c_americana' , 2 :'c_cornuta'}
for i in y:
  Y.append(label_dict[i])
y=np.array(Y,dtype=int)
y_unique = np.unique(y)

In [13]:
# Modelling logistic regression
def logisticRegression(X, y, theta, num_iter):
    # Sigmoid function
    def sigmoid(z):
        return 1.0 / (1 + np.exp(-z))

    # Cost function
    def costFunc(theta, X, y, lr=0.001):
        h = sigmoid(X.dot(theta))
        r = (lr / (2 * len(y))) * np.sum(theta ** 2)
        return (1 / len(y)) * (-y.T.dot(np.log(h)) - (1 - y).T.dot(np.log(1 - h))) + r

    # Gradient descent function
    def gradientFunc(theta, X, y, lr=0.001):
        m, n = X.shape
        theta = theta.reshape((n, 1))
        y = y.reshape((m, 1))
        h = sigmoid(X.dot(theta))
        r = lr * theta / m
        return ((1 / m) * X.T.dot(h - y)) + r

    # Finding best theta
    for i in range(num_iter):
        lineq = np.dot(X, theta)
        h = sigmoid(lineq)
        # Calculating cost function of each class
        cost = costFunc(theta, X, y)
        cost = cost.sum(axis=0)
        # Applying gradient descent to find new theta
        delta = gradientFunc(theta, X, y)
        theta = theta - delta
    return theta


In [14]:
# Model training
score = list()
missclass = 0

# KFold cross validation
for fold in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

    # OneVsRest
    i, k, n = 0, 3, 10  # No of classes and features
    all_theta = np.zeros((k, n))
    for hazelnut in y_unique:
        np_y_train = np.array(y_train == hazelnut, dtype=int)
        best_theta = logisticRegression(X_train, np_y_train, np.zeros((n, 1)), 10000)
        all_theta[i] = best_theta.T
        i += 1

    # Predictions
    prediction = sigmoid(X_test.dot(all_theta.T))
    prediction = prediction.tolist()
    pred = list()
    act = list()
    for _i, i in enumerate(prediction):
        pred.append(_label_dict[i.index(max(i))])
        if _label_dict[i.index(max(i))] != _label_dict[y_test[_i]]:
            missclass += 1
        act.append(_label_dict[y_test[_i]])
    score.append(round(accuracy_score(pred, act) * 100, 2))
    print("The score for Logistic Regression for fold", fold + 1, "is: ", score[fold], '%', " No of misclassfied",
          missclass)
print("The overall score for Logistic Regression is: ", round(sum(score) / len(score), 2), '%')


The score for Logistic Regression for fold 1 is:  91.04 %  No of misclassfied 6
The score for Logistic Regression for fold 2 is:  94.03 %  No of misclassfied 10
The score for Logistic Regression for fold 3 is:  95.52 %  No of misclassfied 13
The score for Logistic Regression for fold 4 is:  89.55 %  No of misclassfied 20
The score for Logistic Regression for fold 5 is:  95.52 %  No of misclassfied 23
The score for Logistic Regression for fold 6 is:  95.52 %  No of misclassfied 26
The score for Logistic Regression for fold 7 is:  89.55 %  No of misclassfied 33
The score for Logistic Regression for fold 8 is:  91.04 %  No of misclassfied 39
The score for Logistic Regression for fold 9 is:  89.55 %  No of misclassfied 46
The score for Logistic Regression for fold 10 is:  91.04 %  No of misclassfied 52
The overall score for Logistic Regression is:  92.24 %
