<a href="https://www.kaggle.com/mindadeepam/classification-algorithms?scriptVersionId=89120031" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
!pip install openpyxl

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm


In [None]:
# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

# Read the pumpkin seeds spreadsheet from the Kaggle input directory.
dataset_path = "../input/pumpkin-seeds-dataset/Pumpkin_Seeds_Dataset.xlsx"
df = pd.read_excel(dataset_path)

In [None]:
# Preview the first five rows of the dataset.
df.head(n=5)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Only the last expression of a cell is rendered, so the dtypes are printed
# explicitly; otherwise their output would be silently discarded.
print(df.dtypes)        # author's note: 12 numeric features - 1 object class label
df["Class"].unique()    # author's note: 2 unique classes

In [None]:
# Print column names, non-null counts and dtypes.
df.info()           # author's note: no null values in df

## Exploratory Data Analysis

In [None]:
# Class balance: number of samples per class (author's note: over 1000 samples for both).
sns.countplot(data=df, x="Class")

In [None]:
# Pairwise plots of all numeric features, coloured by class.
sns.pairplot(df, hue="Class")

In [None]:
# Equiv_Diameter, Convex_Area and Area seem highly correlated.
temp_data = df[['Equiv_Diameter', 'Convex_Area', 'Area', 'Class']]
sns.pairplot(temp_data, hue="Class")
# 'Class' holds strings: pandas >= 2.0 raises a ValueError if it is passed to
# corr(), so correlate the numeric columns only.
print(temp_data.drop(columns='Class').corr())   ## >0.99 correlation

In [None]:
# The three size features, plotted separately for each class.
# The columns are selected from df directly so that this cell does not depend
# on which frame `temp_data` currently refers to (that name is rebound in a
# later cell, which would silently change these plots on an out-of-order run).
size_features = df[['Equiv_Diameter', 'Convex_Area', 'Area', 'Class']]
is_cercevelik = size_features['Class'] == 'Çerçevelik'
sns.pairplot(size_features[is_cercevelik], palette='tab10')
sns.pairplot(size_features[~is_cercevelik])

In [None]:
## Compactness, Aspect_Ration and Eccentricity seem highly correlated
temp_data = df[['Compactness', 'Aspect_Ration', 'Eccentricity', 'Class']]
sns.pairplot(temp_data, hue='Class')
# 'Class' holds strings: pandas >= 2.0 raises a ValueError if it is passed to
# corr(), so correlate the numeric columns only.
print(temp_data.drop(columns='Class').corr())

# Observed correlations (author's notes):
#   Compactness  vs Aspect_Ration : -.99
#   Compactness  vs Eccentricity  : -.98
#   Eccentricity vs Aspect_Ration :  .95

#### But, as seen while building the models, dropping one or more of the collinear features doesn't improve performance.

## Remove Outliers ..

In [None]:
# Box plot of Eccentricity to look for outliers.
sns.boxplot(data=df, x="Eccentricity")


## Models

In [None]:
## Experiment: does dropping collinear features increase accuracy?
## Candidates: 'Compactness', 'Aspect_Ration', 'Eccentricity' ; 'Convex_Area' (?), 'Area', 'Equiv_Diameter'
## Author's findings: dropping 'Aspect_Ration' harms performance, and no
## removal seems to give better performance consistently.
df_ = df.drop(columns=['Aspect_Ration'])


In [None]:
# Author's note: dropping 'Equiv_Diameter' before building X did not seem to
# have any benefit, so all features are kept.
X = df.drop('Class', axis=1).to_numpy()
y = df["Class"].to_numpy()
# Stratified split with a fixed seed so that the split, and therefore the
# scores computed from it, are reproducible from run to run. 460 is the same
# seed the notebook passes to LogisticRegression.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, stratify=y, random_state=460
)

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)
# Open question (author): why does scaling harm performance with the l2 penalty?
# Author's note: scaling is nevertheless required for the liblinear solver.

## Logistic Regression

In [None]:
# Earlier variant: LogisticRegression(C=1, random_state=460, max_iter=300)

# L1-penalised logistic regression with a large C, i.e. weak regularisation.
clf = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    C=1000,
    max_iter=500,
    random_state=460,
)
## Author's notes (presumably for the l2 penalty):
##   very strong regularisation (C -> 0)       : score -> 0.52
##   very weak regularisation (C very large)   : score -> .88 ; .87

clf.fit(X_train, y_train)
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)
print(train_score, test_score)
# train 0.88

- l2 reg score = 0.875, .88 without scaling
- l1 reg score = 0.875 0.9056

## KNN

In [None]:
# k-nearest-neighbours classifier with k = 20; prints train and test accuracy.
clf = KNeighborsClassifier(n_neighbors=20)
clf.fit(X_train, y_train)
knn_train_score, knn_test_score = clf.score(X_train, y_train), clf.score(X_test, y_test)
print(knn_train_score, knn_test_score)
## train 0.89

- .889 train ; .875 test 
- not much different from logistic regression

In [None]:
## Label encoding is not required for sklearn classifiers; kept as a helper.
def encode(x):
    """Map a class label to an integer: 0 for 'Çerçevelik', 1 for anything else."""
    return 0 if x == 'Çerçevelik' else 1

## SVM

In [None]:
# Support-vector classifier with a degree-1 polynomial kernel; prints train
# and test accuracy.
clf = svm.SVC(degree=1, kernel='poly')
clf.fit(X_train, y_train)
svm_scores = (clf.score(X_train, y_train), clf.score(X_test, y_test))
print(*svm_scores)

## Author's recorded scores (presumably train, test) per kernel:
##   poly: 1-.873, .893 ; 2-.78, .766 ; 3-.85, .85; 4-.77, .77; 5-.80, .79...  drops off as degree increases
##   rbf = .88, .88

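#### Why does the SVM sometimes give a low score? Also, increasing the degree of the poly kernel doesn't increase train accuracy.

Possible explanation (to verify): scikit-learn's poly kernel is `(gamma * <x, x'> + coef0) ** degree` with `coef0 = 0` by default, so a degree-d kernel contains only degree-d terms and does not include the lower-degree models. Trying `coef0=1` would test this.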


## Naive Bayes

In [None]:
## Although not all features follow a Gaussian distribution, assuming Gaussian likelihoods would be the best assumption here.
