In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

import helper
import text_embeddings
import performance
import models
import predict

# 1. Load data

In [2]:
# Load the inputs `x` and the target labels `y` through the project's helper module.
# NOTE(review): the data source and the concrete types of x / y are decided inside
# helper.load_data, which is not visible from this notebook.
x, y = helper.load_data()

# 2. Split data into train/test

In [3]:
# Partition inputs and labels into train and test sets.
# NOTE(review): the split ratio and any shuffling/seed are set inside helper.split_data,
# not here — confirm the split is reproducible.
x_train, x_test, y_train, y_test = helper.split_data(x, y)

# 3. Text embedding

In [4]:
# Encode the 'text' column of both splits; the returned `vectorizer` is kept because it is
# later handed to predict.manual_test so new sentences can be encoded the same way.
# NOTE(review): presumably TF-IDF ("tdfif" in the helper's name looks like a misspelling
# of "tfidf") — confirm against text_embeddings.
x_train_features, x_test_features, vectorizer = text_embeddings.encode_tdfif(x_train, x_test, 'text')

# 4. Grid search over `alpha` using SGDClassifier (hinge loss)

-  https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html#sklearn.linear_model.SGDClassifier
-  https://scikit-learn.org/stable/modules/kernel_approximation.html  

In [5]:
# Pick the encoded 'text' features as model inputs and pair each with its labels.
# NOTE(review): the ['text']['data'] layout is whatever text_embeddings returns — confirm.
final_x_train, final_y_train = x_train_features['text']['data'], y_train
final_x_test, final_y_test = x_test_features['text']['data'], y_test

In [6]:
# Estimator class and the fixed settings shared by the grid search and the final model.
# Hinge loss with an L2 penalty makes SGDClassifier fit a linear SVM.
# NOTE(review): learning_rate is left at its default; per the scikit-learn docs eta0 is
# not used by the default 'optimal' schedule — confirm that eta0 is meant to have an effect.
model_class = SGDClassifier
model_params = {
    'eta0': 0.0001,
    'loss': 'hinge',
    'random_state': 15,
    'penalty': 'l2',
    'tol': 1e-3,
    'verbose': 0,
}

In [None]:
%%time
import matplotlib.pyplot as plt
# from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import GridSearchCV
from scipy.stats import randint as sp_randint


alpha_range = [ 0.0001,0.001,0.01,0.1,1,2,3,4,5,6,7,8,9,10,50,100]

model = model_class(**model_params)

search_params = {'alpha' : alpha_range}
search = GridSearchCV(model,
                      search_params,
                      cv=5,
                      scoring="f1",
                     return_train_score=True)


search.fit(final_x_train, final_y_train)
results = pd.DataFrame.from_dict(search.cv_results_)
results = results.sort_values(['param_alpha'])
results['mean_train_score-mean_test_score'] = results['mean_train_score'] - results['mean_test_score']
results[['param_alpha','mean_train_score','mean_test_score', 'mean_train_score-mean_test_score']]

In [None]:
# Plot the mean cross-validated score against alpha for the training and validation folds.
# The grid search is scored with scoring="f1", so the curves are F1 scores (not AUC).
alphas = results['param_alpha'].astype(float)  # cv_results_ can store parameters as objects

fig, ax = plt.subplots()

# Train
ax.plot(alphas, results['mean_train_score'], marker="o", label="Train F1")

# Validation
ax.plot(alphas, results['mean_test_score'], marker="o", label="Validation F1")

# The alpha grid spans 1e-4 to 100; a log axis keeps the small values distinguishable.
ax.set_xscale("log")
ax.set_xlabel("Alpha: hyperparameter")
ax.set_ylabel("F1")
ax.set_title("F1 vs alpha curves")
ax.legend()
plt.show()

# 5. Prediction

In [7]:
# Regularisation strength for the final fit. It is hard-coded here rather than read
# from the grid-search results.
alpha = 0.0001

# Build the estimator from the shared settings plus the chosen alpha, then train it.
final_model = model_class(**model_params, alpha=alpha)
final_model.fit(final_x_train, final_y_train)

# Predictions on the training split first, then on the test split.
final_y_train_pred, final_y_test_pred = (
    final_model.predict(features) for features in (final_x_train, final_x_test)
)


# 6. Performance Measures

In [None]:
# Evaluate the fitted model on the held-out test features and labels.
# NOTE(review): which metrics are computed is defined in performance.get_performance_measures,
# which is not visible from this notebook.
performance_measures = performance.get_performance_measures(final_model, final_x_test, final_y_test)
performance_measures  # bare expression so the notebook displays the result

# Manual test on sample sentences

In [8]:
# Run the project's manual test with the trained model; the vectorizer is passed along
# so the helper can encode its sentences, and "SVM" is the identifier given to the helper.
predict.manual_test(final_model, model_id="SVM", vectorizer=vectorizer)

Unnamed: 0,Text,is_depressed(expected),is_depressed(model output),model output probability (if any)
0,I love my life,0,0,
1,I hate my life,1,1,
2,Nothing ever goes right for me.,1,1,
3,Why does everything bad happen with me?,1,1,
4,Today was such a good day!,0,0,
5,Wow. I had such an amazing time,0,0,
6,Loving how me and my lovely partner is talking...,0,0,
7,Happy Thursday everyone. Thought today was Wed...,0,0,
8,It’s the little things that make me smile. Got...,0,0,
9,Lately I have been feeling unsure of myself as...,1,1,


# Future Work
- Explore string kernel https://github.com/timshenkao/StringKernelSVM
- Train the SVM on word2vec embeddings (https://shop.tarjomeplus.com/UploadFileEn/TPLUS_EN_3959.pdf)
- Try other approaches from this guide to text classification in Python: https://www.analyticsvidhya.com/blog/2018/04/a-comprehensive-guide-to-understand-and-implement-text-classification-in-python/