### ML End-to-end
 1. Load and engineer dataset
 2. EDA
 3. Modelling
 4. Validation
 5. Export model

In [1]:
import pandas as pd
import numpy as np
import spacy
import contractions

In [15]:
# Sklearn Libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score,confusion_matrix

# Save pipeline as pickle
import joblib

In [3]:
df = pd.read_csv('data/dataset.csv')

In [4]:
df.columns

Index(['sentence', 'label'], dtype='object')

In [5]:
df.head(5)

Unnamed: 0,sentence,label
0,I shared your email,Student has shared
1,I just shared your address,Student has shared
2,Ive sent your email address to my friend,Student has shared
3,Ive shared your email,Student has shared
4,I already shared email,Student has shared


I'm going to approach this problem as a binary classification task.

In [6]:
# Encode the target as binary: 1 for 'Student has shared', 0 for every other label.
# The column is overwritten in place, so only encode while it still holds the raw
# (non-integer) labels. Without this guard, re-running the cell would compare the
# already-encoded 1/0 values against the string and silently turn every label into 0.
if not pd.api.types.is_integer_dtype(df['label']):
    df['label'] = np.where(df.label == 'Student has shared', 1, 0)

In [7]:
df.head(5)

Unnamed: 0,sentence,label
0,I shared your email,1
1,I just shared your address,1
2,Ive sent your email address to my friend,1
3,Ive shared your email,1
4,I already shared email,1


In [8]:
round(df.label.value_counts(normalize=True),2)*100

1    70.0
0    30.0
Name: label, dtype: float64

Only about 30% (roughly one third) of the sentences have label 0 (_Student wants to know if can share_).

In [9]:
# Load spacy model for lemmatization
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [20]:
# Lemmatization was initially used as a preprocessor for CountVectorizer(), but it was ultimately excluded.
# It simplified the dataset too much and removed word relationships.
# It would work better for a more complex dataset.

def clean_lemmatization(sentence):
    """Expand contractions in ``sentence`` and return its lemmatized form.

    The sentence is split on whitespace, each piece is passed through
    ``contractions.fix``, and the pieces are re-joined with single spaces.
    The result is run through the notebook-level spaCy pipeline ``nlp`` and
    the lemma of every token is joined with single spaces.

    Parameters
    ----------
    sentence : str
        Raw input text.

    Returns
    -------
    str
        Space-separated lemmas of the contraction-expanded text.
    """
    expanded_text = ' '.join(contractions.fix(piece) for piece in sentence.split())
    lemmas = (token.lemma_ for token in nlp(expanded_text))
    return " ".join(lemmas)

In [12]:
# Independent variable (feature): the sentence text
X = df.sentence

# Dependent variable (target): the label column
y = df.label

# Shuffled train/test split with a fixed seed for reproducibility.
# No test_size is passed, so scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, random_state=123)

In [39]:
# Text-classification pipeline: 1- and 2-gram counts -> TF-IDF weighting -> random forest.
pipeline = Pipeline([
    ('vect', CountVectorizer(lowercase=True, ngram_range=(1,2))),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier(class_weight='balanced', random_state=123))
])

# Grid search parameters for RF Classifier
# (the 'clf__' prefix targets the pipeline step named 'clf')
param_grid = {
    'clf__n_estimators':[10, 50, 100, 1000]
}

# Cross-validated search (cv=4) over the grid, fitted on the training split only.
# NOTE(review): no `scoring` is passed, so the estimator's default score is used
# to rank candidates -- confirm this is the metric intended for model selection.
grid = GridSearchCV(pipeline, cv=4, param_grid=param_grid)
grid.fit(X_train,y_train)

# NOTE(review): the label reads "Best parameters", but the first value printed is
# grid.best_score_; the winning parameters are the second value.
print("Best parameters: %0.4f using %s" % (grid.best_score_, grid.best_params_))

# Per-candidate cross-validation summary: mean and standard deviation of the test score.
means = grid.cv_results_['mean_test_score']
stds = grid.cv_results_['std_test_score']
params = grid.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
    print("Mean score of %0.4f with a standard deviation of %0.2f | Parameters: %r" % (mean, stdev, param))

Best parameters: 0.8875 using {'clf__n_estimators': 100}
Mean score of 0.8250 with a standard deviation of 0.10 | Parameters: {'clf__n_estimators': 10}
Mean score of 0.8250 with a standard deviation of 0.10 | Parameters: {'clf__n_estimators': 50}
Mean score of 0.8875 with a standard deviation of 0.11 | Parameters: {'clf__n_estimators': 100}
Mean score of 0.8250 with a standard deviation of 0.10 | Parameters: {'clf__n_estimators': 1000}


In [24]:
best_model = grid.best_estimator_

In [27]:
y_preds = best_model.predict(X_test)

In [34]:
best_model.score(X_test, y_test)

0.8333333333333334

In [33]:
f1_score(y_test, y_preds)

0.8571428571428571

In [28]:
confusion_matrix(y_test, y_preds)

array([[2, 1],
       [0, 3]])

In [None]:
# Export the fitted pipeline (best_model) to 'test.pkl', a path relative to the working directory.
joblib.dump(best_model, 'test.pkl')