In [48]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Importing Dataset 

In [49]:
df=pd.read_csv('train.csv')

In [50]:
X=df.iloc[:,1:86].values
y=df.iloc[:,86].values

In [51]:
from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder(categorical_features = [0,3,4])
X = onehotencoder.fit_transform(X).toarray()

In [57]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [32]:
df['Purchase'].value_counts()

0    2454
1     164
Name: Purchase, dtype: int64

# Dealing with imbalance dataset 

## 1. Upsampling 

In [9]:
from sklearn.utils import resample

In [218]:
df_majority=df[df.Purchase==0]
df_minority=df[df.Purchase==1]

In [219]:
# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=2454)   # to match majority class

In [220]:
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

In [221]:
df_upsampled.Purchase.value_counts()

1    2454
0    2454
Name: Purchase, dtype: int64

In [222]:
# Separate input features (X) and target variable (y)
y = df_upsampled.Purchase
X = df_upsampled.drop('Purchase', axis=1)
X=X.iloc[:,1:]

In [20]:
from sklearn.metrics import accuracy_score

In [36]:
from sklearn.svm import SVC
svm_classifier= SVC(kernel='rbf', random_state=0)
svm_classifier.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

In [37]:
# Predict on training set
pred_y_1 = svm_classifier.predict(X_test)

In [38]:
# Is our model still predicting just one class?
print( np.unique( pred_y_1 ) )

[0 1]


In [39]:
# How's our accuracy?
print( accuracy_score(y_test, pred_y_1) )

0.545454545455


In [40]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, pred_y_1)
print(cm)

[[22 14]
 [16 14]]


## 2.Down-sample Majority Class

In [52]:
df_majority=df[df.Purchase==0]
df_minority=df[df.Purchase==1]

In [53]:
# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=164)

In [54]:
# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

In [55]:
# Display new class counts
df_downsampled.Purchase.value_counts()

1    164
0    164
Name: Purchase, dtype: int64

In [56]:
# Separate input features (X) and target variable (y)
y = df_downsampled.Purchase
X = df_downsampled.drop('Purchase', axis=1)
X=X.iloc[:,1:]

In [30]:
# Train model
clf_2 = LogisticRegression().fit(X, y)
 
# Predict on training set
pred_y_2 = clf_2.predict(X)

In [31]:
# Is our model still predicting just one class?
print( np.unique( pred_y_2 ) )

[0 1]


In [32]:
# How's our accuracy?
print( accuracy_score(y, pred_y_2) )

0.762195121951


## Changing Performance Metric 

In [33]:
from sklearn.metrics import roc_auc_score

In [34]:
# Predict class probabilities
prob_y_2 = clf_2.predict_proba(X)

In [35]:
# Keep only the positive class
prob_y_2 = [p[1] for p in prob_y_2]

In [36]:
prob_y_2[:5]

[0.57947159113401558,
 0.78929834084182449,
 0.68983269489248467,
 0.27966279351001339,
 0.70008343412560181]

In [37]:
print( roc_auc_score(y, prob_y_2) )

0.849568709102


In [39]:
##Previous model
# Separate input features (X) and target variable (y)
y = df.Purchase
X = df.drop('Purchase', axis=1)
X=X.iloc[:,1:]

# Train model
clf_0 = LogisticRegression().fit(X, y)
 
# Predict on training set
pred_y_0 = clf_0.predict(X)

In [40]:
# How's the accuracy?
print( accuracy_score(pred_y_0, y) )

0.936974789916


In [41]:
# Should we be excited?
print( np.unique( pred_y_0 ) )

[0 1]


In [42]:
#AUROC of imbalance model
prob_y_0 = clf_0.predict_proba(X)
prob_y_0 = [p[1] for p in prob_y_0]
 
print( roc_auc_score(y, prob_y_0) )

0.80599369869


## 4. Penalize Algorithms (Cost-Sensitive Training) 

In [171]:
from sklearn.svm import SVC

In [44]:
# Separate input features (X) and target variable (y)
y = df.Purchase
X = df.drop('Purchase', axis=1)
X=X.iloc[:,1:]

In [195]:
# Train model
clf_3 = SVC(kernel='rbf', 
            class_weight='balanced', # penalize
            probability=True)

In [196]:
clf_3.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [197]:
# Predict on training set
y_pred_svm = clf_3.predict(X_test)

In [198]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred_svm)
print(cm)

[[302 189]
 [ 18  15]]


In [199]:
# Is our model still predicting just one class?
print( np.unique( y_pred_svm ) )

[0 1]


In [200]:
np.count_nonzero(y_pred_svm)

204

In [201]:
# How's our accuracy?
print( accuracy_score(y_test, y_pred_svm) )

0.604961832061


## 5. Use Tree-Based Algorithms

In [51]:
from sklearn.ensemble import RandomForestClassifier

In [85]:
# Separate input features (X) and target variable (y)
y = df.Purchase
X = df.drop('Purchase', axis=1)
X=X.iloc[:,1:]

In [86]:
from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder(categorical_features = [0,3,4])
X = onehotencoder.fit_transform(X).toarray()

In [87]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Dimentionality Reduction 

In [33]:
# Applying PCA
from sklearn.decomposition import PCA
pca = PCA(n_components = None)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
explained_variance = pca.explained_variance_ratio_

In [34]:
print(explained_variance)

[  6.29688208e-01   8.12495281e-02   3.54146093e-02   2.83777040e-02
   2.63061786e-02   2.37820961e-02   1.75201052e-02   1.31953900e-02
   1.27713864e-02   1.24516088e-02   1.16929669e-02   1.05650608e-02
   9.27367507e-03   8.21238352e-03   6.64968469e-03   6.54950871e-03
   5.59547429e-03   5.14353487e-03   5.02296853e-03   4.44372972e-03
   4.36366911e-03   3.58955115e-03   3.50624711e-03   3.37680951e-03
   3.01924599e-03   2.75621802e-03   2.44810300e-03   2.27364672e-03
   2.02703621e-03   1.78452007e-03   1.62587557e-03   1.49161726e-03
   1.35802886e-03   1.24397983e-03   1.14671325e-03   1.09025181e-03
   1.00697008e-03   8.58359135e-04   7.50662912e-04   6.91714552e-04
   6.13056817e-04   5.59424574e-04   5.22016299e-04   4.61818611e-04
   3.89201184e-04   2.93553627e-04   2.74561356e-04   2.38625402e-04
   2.26480077e-04   2.18229786e-04   2.05716349e-04   1.96461326e-04
   1.74219110e-04   1.59054128e-04   1.56729435e-04   1.46829335e-04
   1.35512186e-04   1.32018179e-04

In [35]:
# Applying PCA
from sklearn.decomposition import PCA
pca = PCA(n_components = 4)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
explained_variance = pca.explained_variance_ratio_

# Modeling 

In [89]:
# Train model
clf_4 = RandomForestClassifier()
clf_4.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [90]:
# Predict on training set
pred_y_4 = clf_4.predict(X_test)

In [91]:
print( np.unique( pred_y_4 ) )

[0 1]


In [92]:
print( accuracy_score(y_test, pred_y_4) )

0.925572519084


In [93]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, pred_y_4)
print(cm)

[[484   7]
 [ 32   1]]


# Predicting the Test Data

In [41]:
df_test=pd.read_csv('test.csv')

In [42]:
X_test=df_test.iloc[:,1:].values

In [45]:
X_test = onehotencoder.fit_transform(X_test).toarray()

In [47]:
X_test = pca.transform(X_test)

ValueError: operands could not be broadcast together with shapes (2553,141) (85,) 

In [236]:
y_pred_randFor = svm_classifier.predict(X_test)

In [237]:
print( np.unique(y_pred_randFor) )

[0 1]


In [238]:
np.count_nonzero(y_pred_randFor)

363

# Saving the file

In [239]:
#Saving the file
result=pd.read_csv('update.csv')
result['Purchase']=y_pred_randFor

In [240]:
result.to_csv('svm.csv', sep='\t',index=False)