In [None]:
import pandas as pd
import numpy as np

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from sklearn.preprocessing import Normalizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn import metrics
from sklearn.metrics import roc_auc_score, roc_curve, auc, confusion_matrix

from sklearn.model_selection import train_test_split, RandomizedSearchCV

from sklearn.neighbors import KNeighborsClassifier

In [None]:
from scipy.sparse import hstack

In [None]:
import nltk

In [None]:
import plotly.graph_objects as go
from plotly.offline import offline
offline.init_notebook_mode()

In [None]:
import os
from tqdm import tqdm
import re
import pickle
from collections import Counter

In [None]:
def _print(statement, arguments, do_print = True):
    
    if do_print:
        print(statement.format(*arguments))
    
    return

In [None]:
def print_lb(character, num = 60):

    print(character*num)
    
    return

# [1] K Nearest Neighbor

## [1.1] Loading Data

In [None]:
data = pd.read_csv("./data/preprocessed_data.csv", nrows = 5000)
_print("Top 3 rows of DataFrame : \n{}", [data.head(n = 3)])

In [None]:
columns = data.columns.values
_print("DataFrame columns: {}", [columns], True)

In [None]:
Y = data['project_is_approved'].values
X = data.drop(labels = 'project_is_approved', axis = 1, inplace = False)

In [None]:
_print("1. Type of X part of the Data: \n{}\n", [type(X)])
_print("2. Shape of X part of the Data: \n{}\n", [X.shape])
_print("3. X part of the Data: \n{}", [X.head(n = 3)])

In [None]:
_print("1. Type of Y part of the Data: \n{}\n", [type(Y)])
_print("2. Shape of Y part of the Data: \n{}\n", [Y.shape])
_print("3. Y part of the Data: \n{}", [Y[:5]])

## [1.2] Splitting data into Train and Cross Validation: Startified Sampling

In [None]:
D_Train, d_test, Y_Train, y_test = train_test_split(X, Y, test_size = 0.3, stratify = Y)

In [None]:
d_train, d_cv, y_train, y_cv = train_test_split(D_Train, Y_Train, test_size = 0.3, stratify = Y_Train)

In [None]:
_print('d_train type: {}', [type(d_train)])
_print('d_train shape: {}', [d_train.shape])
_print('y_train shape: {}', [y_train.shape])


In [None]:
_print('d_cv type: {}', [type(d_cv)])
_print('d_cv shape: {}', [d_cv.shape])
_print('y_cv shape: {}', [y_cv.shape])

In [None]:
_print('d_test type: {}', [type(d_test)])
_print('d_test shape: {}', [d_test.shape])
_print('y_test shape: {}', [y_test.shape])

## [1.3] Make Model Data Ready: encoding essay, and project title

In [None]:
ex = ["classroom students should take break",
"low income homes students receive free breakfast",
"students will receive low grades"]

In [None]:
count_vectorizer = CountVectorizer()

count_vectorizer.fit(ex)
vectorizer = count_vectorizer.transform(ex)

_print("Data Type of vectorizer: {}\n\n", [type(vectorizer)])
_print("Sparse Matrix: \n{}",[vectorizer.toarray()])

In [None]:
_print("Feature names: \n{}", [count_vectorizer.get_feature_names()])

In [None]:
count_vectorizer = CountVectorizer(min_df = 10, ngram_range = (1, 4), max_features = 5000)
count_vectorizer.fit(d_train['essay'].values)

> After vectorization shape of d_train_essay_bow, d_cv_essay_bow, d_test_essay_bow

In [None]:
d_train_essay_bow = count_vectorizer.transform(d_train['essay'].values)
_print("Shape of d_train_essay_bow: {}\n", [d_train_essay_bow.shape])

In [None]:
d_cv_essay_bow = count_vectorizer.transform(d_cv['essay'].values)
_print("Shape of d_cv_essay_bow: {}\n", [d_cv_essay_bow.shape])


In [None]:
d_test_essay_bow = count_vectorizer.transform(d_test['essay'].values)
_print("Shape of d_test_essay_bow: {}\n", [d_test_essay_bow.shape])

## [1.4] Make model data ready: Encoding numerical and categorical feature

In [None]:
"""
[
 'school_state' 'teacher_prefix' 'project_grade_category'
 'teacher_number_of_previously_posted_projects' 'project_is_approved'
 'clean_categories' 'clean_subcategories' 'essay' 'price'
 ]
 'school_state' 'teacher_prefix' 'project_grade_category', 'clean_categories' 'clean_subcategories'
"""

### [1.4.1] Encoding categorical feature: school_state

In [None]:
"""
# Here we are encoding school_state columns using parameter binary = False
# count_vectorizer_ss: Count Vectorizer school state
count_vectorizer_ss = CountVectorizer(binary = True)

count_vectorizer_ss.fit(d_train['school_state'])

school_state_ohe = count_vectorizer_ss.transform(d_train['school_state'])
_print("Type of school_state_ohe: {}\n", [type(school_state_ohe)])

school_state_ohe_array = school_state_ohe.toarray()
_print("Size of school_state_ohe: {}\n", [school_state_ohe_array.size])
_print("school_state_ohe_array values: \n{}", [school_state_ohe_array])

"""

"""
Type of school_state_ohe: <class 'scipy.sparse.csr.csr_matrix'>

Size of school_state_ohe: 122500

school_state_ohe_array values: 
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
 """

In [None]:
count_vectorizer_ss = CountVectorizer(binary = False)

In [None]:
school_state_values = d_train['school_state'].values
count_vectorizer_ss.fit(school_state_values)

d_train_school_state_ohe = count_vectorizer_ss.transform(school_state_values)

d_train_school_state_ohe = count_vectorizer_ss.transform(d_train['school_state'])
d_cv_school_state_ohe = count_vectorizer_ss.transform(d_cv['school_state'])
d_test_school_state_ohe = count_vectorizer_ss.transform(d_test['school_state'])

_print("Type of d_train_school_state_ohe: {}\n", [type(d_train_school_state_ohe)])
_print("Shape of d_train_school_state_ohe: {}\n", [d_train_school_state_ohe.shape])
_print("Shape of d_cv_school_state_ohe: {}\n", [d_cv_school_state_ohe.shape])
_print("Shape of d_test_school_state_ohe: {}\n", [d_test_school_state_ohe.shape])

### [1.4.2] Encoding categorical feature: Teacher prefix

In [None]:
teacher_prefix_values = d_train['teacher_prefix'].values
count_vectorizer_tp = CountVectorizer(binary = False)

count_vectorizer_tp.fit(teacher_prefix_values)

d_train_tp_ohe = count_vectorizer_tp.transform(teacher_prefix_values)
d_cv_tp_ohe = count_vectorizer_tp.transform(d_cv['teacher_prefix'].values)
d_test_tp_ohe = count_vectorizer_tp.transform(d_test['teacher_prefix'].values)

_print("Type of d_train_tp_ohe: {}\n", [type(d_train_tp_ohe)])
_print("Shape of d_train_tp_ohe: {} | y_train: {}\n", [d_train_tp_ohe.shape, y_train.shape])
_print("Shape of d_cv_tp_ohe: {} | y_cv: {}\n", [d_cv_tp_ohe.shape, y_cv.shape])
_print("Shape of d_test_tp_ohe: {} | d_test: {}\n", [d_test_tp_ohe.shape, y_test.shape])


### [1.4.3] Encoding categorical feature: project_grade_category

In [None]:
project_grade_category_values = d_train['project_grade_category'].values

count_vectorizer_pgc = CountVectorizer()
count_vectorizer_pgc.fit(project_grade_category_values)

In [None]:
d_train_pgc_ohe = count_vectorizer_pgc.transform(project_grade_category_values)
d_cv_pgc_ohe = count_vectorizer_pgc.transform(d_cv['project_grade_category'].values)
d_test_pgc_ohe = count_vectorizer_pgc.transform(d_test['project_grade_category'].values)

_print("Type of d_train_pgc_ohe: {}\n", [type(d_train_pgc_ohe)])
_print("Shape of d_train_pgc_ohe: {} | y_train: {}\n", [d_train_pgc_ohe.shape, y_train.shape])
_print("Shape of d_cv_pgc_ohe: {} | y_cv: {}\n", [d_cv_pgc_ohe.shape, y_cv.shape])
_print("Shape of d_test_pgc_ohe: {} | d_test: {}\n", [d_test_pgc_ohe.shape, y_test.shape])

### [1.4.4] Encoding numerical feature: price

In [None]:
normalizer = Normalizer()

In [None]:
price_values = d_train['price'].values
_print("Type of price_values: {}", [type(price_values)])
_print("Size of price_values: {}", [price_values.size])
_print("Initial 5 values of price_values: {}", [price_values[:5]])

In [None]:
normalizer.fit(price_values.reshape(1, -1))

In [None]:
d_train_price_en = normalizer.transform(price_values.reshape(1, -1))
d_cv_price_en = normalizer.transform(d_cv['price'].values.reshape(1, -1))
d_test_price_en = normalizer.transform(d_test['price'].values.reshape(1, -1))

_print("1.Type of d_train_price_en: {}", [type(d_train_price_en)])
_print("1.1 Shape of d_train_price_en: {}", [d_train_price_en.shape])
_print("1.3 Data insigh of d_train_price_en: {}\n", [d_train_price_en[:3]])
_print("2. Shape of d_cv_price_en: {}\n", [d_cv_price_en.shape])
_print("3. Shape of d_test_price_en: {}\n", [d_test_price_en.shape])

## [1.4.5] Concatinating all the features

In [None]:
"""
d_train_school_state_ohe = count_vectorizer_ss.transform(d_train['school_state'])
d_cv_school_state_ohe = count_vectorizer_ss.transform(d_cv['school_state'])
d_test_school_state_ohe = count_vectorizer_ss.transform(d_test['school_state'])


d_train_price_en = normalizer.transform(price_values.reshape(1, -1))
d_cv_price_en = normalizer.transform(d_cv['price'].values.reshape(1, -1))
d_test_price_en = normalizer.transform(d_test['price'].values.reshape(1, -1))
"""

In [None]:
D_TRAIN = hstack((d_train_school_state_ohe, d_train_price_en.reshape(-1, 1))).tocsr()
_print("Shape of D_TRAIN: {}",[D_TRAIN.shape])
_print("Data type of D_TRAIN: {}",[type(D_TRAIN)])
_print("Data insight of D_TRAIN: {}",[D_TRAIN[:5]])

In [None]:
D_CV = hstack((d_cv_school_state_ohe, d_cv_price_en.reshape(-1, 1))).tocsr()
print("Shape of D_CV: {}",[D_CV.shape])

In [None]:
D_TEST = hstack((d_test_school_state_ohe, d_test_price_en.reshape(-1, 1))).tocsr()
print("Shape of D_TEST: {}",[D_TEST.shape])

## [1.5] Applying KNN on different kind of featurization

### [1.5.1] Applying KNN : B.O.W featurization

#### [1.5.1.1] Hyperparameter tuning

##### [1.5.1.1] Method 1

In [None]:
def batch_predict(clfr, data, batch_size = 1000):
    
    quotient_num, remainder_num = np.divmod(data.shape[0], batch_size)
    loop_upper_limit = data.shape[0] - remainder_num
    
    y_hat = list()
    for i in range(0, loop_upper_limit, batch_size):

        _y_hat = clfr.predict_proba(D_TRAIN[i:i+batch_size])[:, 1]
        y_hat.extend(_y_hat)

    if remainder_num != 0:
        _y_hat = clfr.predict_proba(D_TRAIN[i:i+remainder_num])[:, 1]
        y_hat.extend(_y_hat)
        
    return y_hat

In [None]:
clfr = KNeighborsClassifier()

In [None]:
neighbors = [3, 5, 10]
train_auc, cv_auc = list(), list()
for neighbor in neighbors:
    
    
    clfr.n_neighbors = neighbor
    clfr.fit(D_TRAIN, y_train)
    
    y_train_hat = batch_predict(clfr, D_TRAIN)
    y_train_score = roc_auc_score(y_train, y_train_hat)
    train_auc.append(y_train_score)
    
    y_cv_hat = batch_predict(clfr, D_CV) 
    y_cv_score = roc_auc_score(y_cv, y_cv_hat)
    cv_auc.append(y_cv_score)
print(train_auc, cv_auc)

##### [1.5.1.2] Method 2

In [59]:
neigh = KNeighborsClassifier(n_jobs= -1)
parameters = {'n_neighbors': [3, 5, 7, 11]}
clsfr = RandomizedSearchCV(neigh, parameters, cv = 3, scoring = 'roc_auc', return_train_score = True)
clsfr.fit(D_TRAIN, y_train)


The total space of parameters 4 is smaller than n_iter=10. Running 4 iterations. For exhaustive searches, use GridSearchCV.



RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=KNeighborsClassifier(algorithm='auto',
                                                  leaf_size=30,
                                                  metric='minkowski',
                                                  metric_params=None, n_jobs=-1,
                                                  n_neighbors=5, p=2,
                                                  weights='uniform'),
                   iid='deprecated', n_iter=10, n_jobs=None,
                   param_distributions={'n_neighbors': [3, 5, 7, 11]},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=True, scoring='roc_auc', verbose=0)

> After Classifier fits data, lets observe RandomizedSearchCV result

In [61]:
results_1 = pd.DataFrame(clsfr.cv_results_)
_print("DataFrame ans result: \n{}", [ans])

DataFrame ans result: 
   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0       0.001330  4.699094e-04         0.137769        0.011231   
1       0.000998  8.920806e-07         0.139879        0.008968   
2       0.000997  2.239387e-06         0.135729        0.000856   
3       0.000995  4.753779e-06         0.153590        0.014662   

  param_n_neighbors               params  split0_test_score  \
0                 3   {'n_neighbors': 3}           0.504783   
1                 5   {'n_neighbors': 5}           0.493635   
2                 7   {'n_neighbors': 7}           0.516298   
3                11  {'n_neighbors': 11}           0.521184   

   split1_test_score  split2_test_score  mean_test_score  std_test_score  \
0           0.519622           0.518083         0.514163        0.006662   
1           0.508127           0.501719         0.501160        0.005929   
2           0.505259           0.502238         0.507932        0.006043   
3           0.489289 

In [62]:
train_roc_auc_mean = results_1['mean_train_score']
_print("Mean Train ROC AUC score: \n{}", [train_roc_auc_mean])
print_lb("==")

train_roc_auc_std = results_1['std_train_score']
_print("\nStd Train ROC AUC score: \n{}\n", [train_roc_auc_std])
print_lb("==")

cv_roc_auc_mean = results_1['mean_test_score']
_print("\nMean CV ROC AUC score: \n{}\n", [cv_roc_auc_mean])
print_lb("==")

cv_roc_auc_std = results_1['std_test_score']
_print("\nStd CV ROC AUC score: \n{}\n", [cv_roc_auc_std])

Mean Train ROC AUC score: 
0    0.898077
1    0.836498
2    0.800105
3    0.745515
Name: mean_train_score, dtype: float64

Std Train ROC AUC score: 
0    0.004538
1    0.003839
2    0.006043
3    0.001846
Name: std_train_score, dtype: float64


Mean CV ROC AUC score: 
0    0.514163
1    0.501160
2    0.507932
3    0.504136
Name: mean_test_score, dtype: float64


Std CV ROC AUC score: 
0    0.006662
1    0.005929
2    0.006043
3    0.013114
Name: std_test_score, dtype: float64

