In [1]:
import pandas as pd
import numpy as np
import spacy
import contractions

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
#Sklearn Libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [4]:
df = pd.read_csv('dataset.csv')

In [5]:
df.columns

Index(['sentence', 'label'], dtype='object')

I'm going to approach this problem as binary classification

In [6]:
df['label'] = np.where(df.label == 'Student has shared', 1, 0)

In [7]:
round(df.label.value_counts(normalize=True),2)*100

1    70.0
0    30.0
Name: label, dtype: float64

Only 1/3 of sentences have label 0 (_Student wants to know if can share_)

In [8]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [9]:
#Dependent Variable
X = df.sentence
#Independent Variables
y = df.label
#Splitting for checking the performance of the models on a holdout dataset
X_train, X_val, y_train, y_val = train_test_split(X, y, shuffle = True)

In [10]:
#Function for basic cleaning/preprocessing texts
def clean(sentence):
    expanded_words = []
    for word in sentence.split():
        expanded_words.append(contractions.fix(word))
    expanded_text = ' '.join(expanded_words)
    doc = nlp(expanded_text)
    return " ".join([token.lemma_ for token in doc])

In [11]:
#vectorizer = CountVectorizer(max_features= 5000, preprocessor=clean, ngram_range=(1,2))
vectorizer = CountVectorizer(lowercase=True, ngram_range=(1,2))

In [12]:
#text_transformer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), lowercase=True, max_features=150)
vectorizer = TfidfVectorizer(lowercase=True)

In [13]:
X_train_vectorized = vectorizer.fit_transform(X_train)

In [14]:
X_val_vectorized = vectorizer.transform(X_val)

In [15]:
X_train_vectorized.shape, X_val_vectorized.shape

((17, 28), (6, 28))

In [16]:
vectorizer.get_feature_names_out()

array(['address', 'all', 'already', 'been', 'can', 'card', 'contact',
       'contacts', 'could', 'did', 'digits', 'email', 'friend', 'friends',
       'has', 'have', 'ive', 'just', 'may', 'might', 'my', 'okay',
       'share', 'shared', 'the', 'we', 'with', 'your'], dtype=object)

In [17]:
pd.DataFrame(X_train_vectorized.toarray(), columns = vectorizer.get_feature_names_out())

Unnamed: 0,address,all,already,been,can,card,contact,contacts,could,did,...,may,might,my,okay,share,shared,the,we,with,your
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.390952,0.0,0.0,0.0,0.328881
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.771867,0.0,0.0,0.469427,0.0,0.0,0.0,0.0,0.285433
2,0.0,0.404332,0.0,0.404332,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.316674,0.0,0.0,0.17774,0.353055,0.0,0.316674,0.0
3,0.563756,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.347375,0.0,0.0,0.0,0.292223
4,0.313555,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.439515,0.0,...,0.0,0.0,0.34423,0.0,0.267301,0.0,0.0,0.439515,0.34423,0.162531
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.62053,0.0,0.0,0.0,0.52201
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.588237,0.0,0.258582,0.513638,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.867114,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.381174,0.0,0.0,0.0,0.320655
8,0.0,0.0,0.0,0.0,0.455938,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.408955,0.0,0.317561,0.0,0.0,0.0,0.408955,0.193092
9,0.0,0.0,0.0,0.0,0.0,0.655123,0.655123,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.287985,0.0,0.0,0.0,0.242262


In [51]:
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

def ml_pipe(X, y):
    # Perform Grid-Search
    gsc = GridSearchCV(
        estimator=RandomForestClassifier(),
        param_grid={
            'n_estimators': (10, 50, 100, 1000),
        },
        cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
    grid_result = gsc.fit(X, y)
    best_params = grid_result.best_params_
    
    model = RandomForestClassifier(n_estimators=best_params["n_estimators"], random_state=123)
    print('\nBest n_estimator param -> ' + str(best_params["n_estimators"]))
    
    return model

In [49]:
rf_clf = ml_pipe(X_train_vectorized, y_train)
cv_result_accuracy = rfclf.mean()


scores = cross_val_score(model, X, y, cv=5, scoring='neg_root_mean_squared_error')

print("Mean score of %0.2f with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

Fitting 5 folds for each of 4 candidates, totalling 20 fits

Best n_estimator param -> 50


In [None]:
# Perform K-Fold CV
scores = cross_val_score(rf_clf, X_train_vectorized, y_train, cv=5, scoring='accuracy')

In [134]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

In [135]:
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, random_state=123)


numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(sparse=True, handle_unknown='ignore')),
    ('tsvd', TruncatedSVD(n_components=1, algorithm='arpack', tol=1e-4))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

model = Pipeline(steps=[
    ('precprocessor', preprocessor),
    ('classifier', classifier)
])


In [129]:
model_transformer = Pipeline([
    ('vect', CountVectorizer(lowercase=True, ngram_range=(1,2))),
    ('tfidf', TfidfTransformer()),
])

In [130]:
t = model_transformer.fit(X_train,y_train)

In [146]:
t.transform(['I shared email with you']).toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.25471119, 0.        , 0.        , 0.        , 0.57943017,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.24020512, 0.57943017, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.45381172, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ]])

In [191]:
pipeline = Pipeline([
    ('vect', CountVectorizer(lowercase=True, ngram_range=(1,2))),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier(random_state=123))
])

# this is where you define the values for
# GridSearchCV to iterate over
param_grid = {
    'clf__n_estimators':[10, 50, 100, 1000]
}

# do 3-fold cross validation for each of the 6 possible
# combinations of the parameter values above
grid = GridSearchCV(pipeline, cv=4, param_grid=param_grid)
grid.fit(X_train,y_train)

# summarize results
print("Best: %f using %s" % (grid.best_score_, grid.best_params_))
    
means = grid.cv_results_['mean_test_score']
stds = grid.cv_results_['std_test_score']
params = grid.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.887500 using {'clf__n_estimators': 10}
0.887500 (0.113880) with: {'clf__n_estimators': 10}
0.887500 (0.113880) with: {'clf__n_estimators': 50}
0.825000 (0.103078) with: {'clf__n_estimators': 100}
0.825000 (0.103078) with: {'clf__n_estimators': 1000}


In [195]:
import joblib

In [196]:
joblib.dump(pipeline, 'test.pkl')

['test.pkl']

In [193]:
# with the information above, you can be more 
# comfortable to train on the whole dataset
model = pipeline.fit(X_train,y_train)

y_preds = model.predict(X_test)

In [194]:
model.score(X_test, y_test)

0.8333333333333334

In [187]:
y_preds

array([1, 0, 0, 0, 1, 0])

In [188]:
X_test

5                          Ive just shared your address
19    Could we share your email address with my friends
20                Can I share your email with my friend
18                             Might I share your email
14                 I have sent this email to my friends
8                                I did share your email
Name: sentence, dtype: object

In [159]:
from sklearn.metrics import confusion_matrix
 
cm = confusion_matrix(y_test, y_preds)
cm

array([[3, 0],
       [1, 2]])

In [160]:
pipeline[0].transform(["Tets"]).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0]])

In [166]:
X_test,y_test

(5                          Ive just shared your address
 19    Could we share your email address with my friends
 20                Can I share your email with my friend
 18                             Might I share your email
 14                 I have sent this email to my friends
 8                                I did share your email
 Name: sentence, dtype: object,
 5     1
 19    0
 20    0
 18    0
 14    1
 8     1
 Name: label, dtype: int64)

In [169]:
model.predict(['Could we share your meial address'])

array([0])

In [117]:
pipeline.fit_transform

AttributeError: This 'Pipeline' has no attribute 'transform'

In [114]:
from sklearn.metrics import f1_score
f1_score(y_test, y_preds, average='micro')

0.8333333333333334

In [None]:
scores = cross_val_score(model, X, y, cv=5, scoring='neg_root_mean_squared_error')

print("Mean score of %0.2f with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

In [None]:
 
# n_estimators can be said as number of
# trees, experiment with n_estimators
# to get better results
model = RandomForestClassifier(n_estimators = 500, criterion = 'entropy', class_weight='balanced')

In [None]:
pd.DataFrame(np.reshape(model.feature_importances_, (1, 31)), columns = vectorizer.get_feature_names_out())

In [None]:
model.fit(X_train_vectorized, y_train)

In [None]:
y_pred = model.predict(X_val_vectorized)

In [None]:
y_pred

In [None]:
from sklearn.metrics import confusion_matrix
 
cm = confusion_matrix(y_val, y_pred)
 
cm

In [None]:
model.score(X_val_vectorized, y_val)

In [None]:
model.predict(vectorizer.transform(["Could we share email"]))

In [None]:
# Return the mean accuracy on the given test data and labels.