In [4]:
import pandas as pd
from sklearn.model_selection import KFold 
from sklearn.metrics import confusion_matrix
from sklearn import svm

uk_clocks = pd.read_csv('uk_clocks.csv')

In [30]:
predictors = ['listingIsTopRated', 'sellerFeedbackScore', 'sellerPositivePercent', 'sellerIsTopRated', 'endAtWeekend', 'endAtEvening', 'length', 'isBroken', 'isUsed', 'isLarge', 'freeShipping']

X = uk_clocks.loc[:,predictors]
y = uk_clocks.loc[:,'isSold']

kf = KFold(n_splits=5, shuffle=True)
model = svm.SVC(kernel='linear')


In [27]:
y.value_counts()

0    2423
1     440
Name: isSold, dtype: int64

In [31]:
for train_index , test_index in kf.split(X):
    X_train , X_test = X.iloc[train_index,:],X.iloc[test_index,:]
    y_train , y_test = y[train_index] , y[test_index]
    
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    print(y_test.value_counts())
    
    print(confusion_matrix(y_test, y_pred))

0    489
1     84
Name: isSold, dtype: int64
[[452  37]
 [ 65  19]]
0    481
1     92
Name: isSold, dtype: int64
[[459  22]
 [ 80  12]]
0    477
1     96
Name: isSold, dtype: int64
[[452  25]
 [ 84  12]]
0    487
1     85
Name: isSold, dtype: int64
[[487   0]
 [ 83   2]]
0    489
1     83
Name: isSold, dtype: int64
[[488   1]
 [ 83   0]]


In [168]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import cohen_kappa_score
import xgboost as xgb
from sklearn.utils import resample, shuffle

model2 = DecisionTreeClassifier(random_state=0)
# model2 = GradientBoostingClassifier(n_estimators=200, random_state=0, max_depth=20)
# model2 = LogisticRegression(random_state=0, class_weight='balanced')
# model2 = GaussianNB()
# model2 = xgb.XGBClassifier(objective="binary:logistic", eval_metric='error', use_label_encoder=False)

for train_index , test_index in kf.split(X):
    
    X_train , X_test = X.iloc[train_index,:],X.iloc[test_index,:]
    y_train , y_test = y[train_index] , y[test_index]
    
    maj_class = X_train[y_train == 0]
    min_class = X_train[y_train == 1]
    count = y_train.value_counts()[0]
    
    min_class_upsampled = resample(min_class, random_state=0, n_samples=count, replace=True)
    X_upsampled = pd.concat([maj_class,min_class_upsampled])
    y_upsampled = pd.Series([0]*count+[1]*count)
    
    model2.fit(X_train, y_train)
#     model2.fit(X_upsampled,y_upsampled)
    y_pred = model2.predict(X_test)
#     print(y_test.value_counts())
#     print(model2.score(X,y))
    
    print(cohen_kappa_score(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

0.18274672187715657
[[421  62]
 [ 62  28]]
0.19862215430835772
[[441  53]
 [ 55  24]]
0.17929865603590267
[[427  57]
 [ 63  26]]
0.19612947817071769
[[421  66]
 [ 56  29]]
0.23943085735066827
[[422  53]
 [ 64  33]]
