In [None]:
# Uncomment to download the dataset

#!wget "http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip" -O data.zip
#!unzip data.zip

In [None]:
!pip install nltk wordcloud joblib

In [None]:
# for regular expressions
import re 
from sklearn import re

# for text manipulation
import nltk 
import string 
import warnings

# for data manipulation
import numpy as np 
import pandas as pd 

# for data visualization
import matplotlib.pyplot as plt
import seaborn as sns


#importing different libraries for analysis, processing and classification
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk import FreqDist
from nltk.tokenize import word_tokenize 

from wordcloud import WordCloud, STOPWORDS
from math import log, sqrt

# vectorizer 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier #classification model
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

# performance evaluation criteria 
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, f1_score  

import joblib
import pickle

%matplotlib inline

In [None]:
# Show up to 200 characters per column so long tweet texts are not cut off when a DataFrame is displayed.
pd.set_option("display.max_colwidth", 200) 
# Suppress every Python warning for the rest of the session.
# Note: this also hides library deprecation notices, so they will not show up in cell output.
warnings.filterwarnings("ignore") #ignore warnings

In [None]:
# Fetch the NLTK resources this notebook relies on.
# 'stopwords' backs stopwords.words('english') used in normalizer().
nltk.download('stopwords')        
# 'wordnet' backs the WordNetLemmatizer used in normalizer().
nltk.download('wordnet')
# 'punkt' is the tokenizer data for word_tokenize (imported above).
nltk.download('punkt')

In [None]:
def load_and_clean(filename, encoding=None):
  """Load the tweet CSV and keep only the sentiment label and the tweet text.

  The file is read as header-less (``header=None``) so that its first record
  is kept as data; without it pandas would consume the first row as column
  names, and that row would be lost once the columns are renamed.

  Args:
    filename: Path (or file-like object) of the CSV to read.
    encoding: Text encoding forwarded to ``pd.read_csv``.

  Returns:
    A DataFrame with exactly two columns: ``label`` and ``message``.

  Raises:
    ValueError: If the file does not have exactly six columns.
  """
  # Loading dataset from csv; every row, including the first, is data
  data = pd.read_csv(filename, encoding=encoding, header=None)

  # Setting custom columns from the dataset
  DATASET_COLUMNS = ["label", "ids", "date", "flag", "user", "message"]
  data.columns = DATASET_COLUMNS

  # Dropping irrelevant columns (returns a new frame instead of mutating in place)
  return data.drop(columns=['ids', 'date', 'flag', 'user'])

In [None]:
def load_chunks(data, start=0, end=10000):
  """Take the same positional slice from each sentiment class and stack them.

  Rows labelled 4 come first, followed by rows labelled 0. ``start`` and
  ``end`` are positions within each class, not within the whole frame, so
  the result holds at most ``2 * (end - start)`` rows.
  """
  per_class = []
  for sentiment in (4, 0):
    of_class = data[data.label == sentiment]
    per_class.append(of_class.iloc[start:end, :])
  return pd.concat(per_class)

In [None]:
def normalizer(tweet, sw=True, stemmer=False, tfidf=None, vsw=None):
    """Clean a single tweet and return it as a space-joined string of tokens.

    Steps, in order: drop @mentions, drop URLs, drop a few HTML entities,
    replace every non-letter with a space, lower-case, optionally remove NLTK
    English stop words, then stem (Snowball) or lemmatize (WordNet).

    Args:
        tweet: Raw tweet text.
        sw: When truthy, remove NLTK English stop words.
        stemmer: When truthy, stem with SnowballStemmer; otherwise lemmatize
            with WordNetLemmatizer.
        tfidf, vsw: Accepted and ignored so the same config dict can be
            splatted into both this function and the vectorizer helpers.

    Returns:
        The normalized tweet as one string.

    Note:
        The letters-only step previously used the pattern "^a-zA-Z" (no
        brackets), which only matches that literal text at the start of the
        string, so digits and punctuation were never removed. Vectorizers and
        models fitted on the old output should be refitted.
    """
    # Removing usernames
    text = " ".join(token for token in tweet.split() if not token.startswith('@'))

    # Removing URLs
    text = re.sub(r'(https?:\/\/)(\s)*(www\.)?(\s)*((\w|\s)+\.)*([\w\-\s]+\/)*([\w\-]+)((\?)?[\w\s]*=\s*[\w\%&]*)*', '', text, flags=re.MULTILINE)

    # Removing special HTML5 characters.
    # Must run before the letters-only pass, which would otherwise turn
    # "&amp;" into the stray token "amp".
    for entity in ("&lt;", "&amp;", "&quot;"):
        text = text.replace(entity, "")

    # Removing everything but letters; a space is substituted so that words
    # separated only by punctuation ("sad...tired") are not glued together.
    text = re.sub(r"[^a-zA-Z\s]", " ", text)

    # Tokenizing words
    words = text.lower().split()

    if sw:
        # Build the stop-word set once instead of once per token
        stop_words = set(stopwords.words('english'))
        words = [word for word in words if word not in stop_words]

    if stemmer:
        snowball = SnowballStemmer('english')
        words = [snowball.stem(word) for word in words]
    else:
        lemma = WordNetLemmatizer()
        words = [lemma.lemmatize(word) for word in words]

    # Forming the sentence again
    return " ".join(words)

In [None]:
def create_vectorizer(data, tfidf=True, vsw=False, sw=None, stemmer=None):
  """Fit a text vectorizer on ``data['message']`` and persist it to disk.

  Args:
    data: DataFrame with a ``message`` column to learn the vocabulary from.
    tfidf: Truthy selects TfidfVectorizer, falsy selects CountVectorizer.
    vsw: Truthy enables the vectorizer's built-in English stop-word list.
    sw, stemmer: Not used for fitting; they only take part in the file name
      under which the vectorizer is saved.

  Returns:
    The fitted vectorizer (also saved via ``save_vectorizer``).
  """
  vectorizer_cls = TfidfVectorizer if tfidf else CountVectorizer
  pvectorizer = vectorizer_cls(stop_words='english' if vsw else None)

  # Only the fitted state is needed; the transformed matrix was being discarded.
  # NOTE(review): the vocabulary is learned from the messages exactly as passed in.
  # start_training() passes the un-normalized frame, while normalize_and_vectorize()
  # later transforms normalized (stemmed/lemmatized) text - confirm this is intended.
  pvectorizer.fit(data['message'])

  save_vectorizer(pvectorizer, { 'tfidf':tfidf, 'vsw':vsw, 'sw':sw, 'stemmer':stemmer })

  return pvectorizer

In [None]:
def vectorizer(message, tfidf=True, vsw=False, sw=None, stemmer=None):
  """Transform text with the saved vectorizer that matches the given config.

  Args:
    message: Iterable of already-normalized strings.
    tfidf, vsw, sw, stemmer: Config flags identifying which saved vectorizer
      file to load.

  Returns:
    The document-term matrix produced by the vectorizer's ``transform``.

  Raises:
    RuntimeError: If no saved vectorizer could be loaded for this config.
  """
  configs = { 'tfidf':tfidf, 'vsw':vsw, 'sw':sw, 'stemmer':stemmer }
  pvectorizer = load_vectorizer(configs)

  # load_vectorizer() returns None on any failure; fail with an actionable
  # message instead of an AttributeError on None.transform
  if pvectorizer is None:
    raise RuntimeError(
      "No saved vectorizer could be loaded for config {}; "
      "run create_vectorizer() with the same settings first.".format(configs)
    )

  return pvectorizer.transform(message)

In [None]:
def normalize_and_vectorize(data, **kwargs):
  """Normalize every message in ``data`` and vectorize the result.

  The input frame is left untouched: the normalized text lives in a separate
  Series, so calling this twice on the same frame gives the same result
  instead of normalizing already-normalized text.

  Args:
    data: DataFrame with a ``message`` column of raw tweets.
    **kwargs: Config flags (sw, stemmer, tfidf, vsw) forwarded to both
      ``normalizer`` and ``vectorizer``.

  Returns:
    The document-term matrix returned by ``vectorizer``.
  """
  normalized = data.message.apply(lambda tweet: normalizer(tweet, **kwargs))

  return vectorizer(normalized, **kwargs)

In [None]:
def load_name_from_configs(configs):
  """Translate a config dict into the four short tags used in artifact file names.

  Tags are returned in the fixed order stemmer, tfidf, sw, vsw. Each entry is
  the first tag when the flag is truthy and the second one otherwise.
  Raises KeyError when one of the four keys is missing.
  """
  tag_table = (
    ('stemmer', "STM", "LEM"),
    ('tfidf', "TFI", "BOW"),
    ('sw', "STW", "NSW"),
    ('vsw', "VSW", "NVS"),
  )
  tags = []
  for key, when_set, when_unset in tag_table:
    tags.append(when_set if configs[key] else when_unset)
  return tags

In [None]:
def load_vectorizer(configs):
  """Load the saved vectorizer for ``configs``, or return None if that fails.

  Loading is best-effort: any error (missing file, bad config, unreadable
  pickle) yields None, but the reason is now printed instead of discarded.

  Args:
    configs: Dict with the keys 'stemmer', 'tfidf', 'sw' and 'vsw'.

  Returns:
    The object stored in the matching "VCT.*.pkl" file, or None.
  """
  try:
    name = load_name_from_configs(configs)
    # joblib files are pickles: only load artifacts produced by this notebook.
    return joblib.load("VCT.{}-{}-{}-{}.pkl".format(*name))
  except Exception as e:
    # print rather than warnings.warn: this notebook silences all warnings globally
    print("Could not load vectorizer: {!r}".format(e))
  return None

In [None]:
def save_vectorizer(cv, configs):
  """Persist a fitted vectorizer under a file name derived from ``configs``.

  Saving is best-effort and never raises, but a failure is now reported so a
  later load that finds nothing is not a mystery.

  Args:
    cv: The fitted vectorizer object to store.
    configs: Dict with the keys 'stemmer', 'tfidf', 'sw' and 'vsw'.
  """
  try:
    name = load_name_from_configs(configs)
    joblib.dump(cv, "VCT.{}-{}-{}-{}.pkl".format(*name))
  except Exception as e:
    # print rather than warnings.warn: this notebook silences all warnings globally
    print("Could not save vectorizer: {!r}".format(e))

In [None]:
def load_models(configs):
  """Load the four saved classifiers for ``configs``.

  All-or-nothing and best-effort: if any of the four files cannot be loaded,
  the reason is printed and four Nones are returned.

  Args:
    configs: Dict with the keys 'stemmer', 'tfidf', 'sw' and 'vsw'.

  Returns:
    A tuple ``(lr, mnb, svc, sgd)``, or ``(None, None, None, None)`` on failure.
  """
  try:
    name = load_name_from_configs(configs)
    # joblib files are pickles: only load artifacts produced by this notebook.
    lr = joblib.load("LGR.{}-{}-{}-{}.model".format(*name))
    mnb = joblib.load("MNB.{}-{}-{}-{}.model".format(*name))
    svc = joblib.load("SVC.{}-{}-{}-{}.model".format(*name))
    sgd = joblib.load("SGD.{}-{}-{}-{}.model".format(*name))
    return (lr, mnb, svc, sgd)
  except Exception as e:
    # print rather than warnings.warn: this notebook silences all warnings globally
    print("Could not load saved models: {!r}".format(e))
  return (None, None, None, None)

In [None]:
def save_models(configs, lr, mnb, svc, sgd):
  """Persist the four classifiers under file names derived from ``configs``.

  Saving is best-effort and never raises. If a dump fails part-way, the
  files written before the failure stay on disk and the error is printed.

  Args:
    configs: Dict with the keys 'stemmer', 'tfidf', 'sw' and 'vsw'.
    lr, mnb, svc, sgd: The model objects to store (LGR/MNB/SVC/SGD files).
  """
  try:
    name = load_name_from_configs(configs)
    joblib.dump(lr, "LGR.{}-{}-{}-{}.model".format(*name))
    joblib.dump(mnb, "MNB.{}-{}-{}-{}.model".format(*name))
    joblib.dump(svc, "SVC.{}-{}-{}-{}.model".format(*name))
    joblib.dump(sgd, "SGD.{}-{}-{}-{}.model".format(*name))
  except Exception as e:
    # print rather than warnings.warn: this notebook silences all warnings globally
    print("Could not save models: {!r}".format(e))

In [None]:
def _fit_and_score(model, X_train, y_train, X_test, y_test):
  """Fit ``model`` on the training split and return its accuracy on the test split."""
  model.fit(X_train, y_train)
  return accuracy_score(y_test, model.predict(X_test))


def train_test_save(cv, label, start, end, configs):
  """Train the four classifiers on one chunk, print their accuracy and save them.

  The features are split 80/20 (stratified, fixed random_state). Previously
  saved models are loaded when available, otherwise fresh ones are created.

  NOTE(review): every model is trained with ``fit``, which in scikit-learn
  re-estimates the model from the data it is given. A model loaded from a
  previous chunk therefore does not appear to accumulate what it learned
  earlier - confirm whether incremental training was intended.

  Args:
    cv: Feature matrix for the chunk.
    label: Labels aligned with the rows of ``cv``.
    start, end: Chunk bounds, recorded in the returned dict only.
    configs: Config dict; selects the model files and is copied into the result.

  Returns:
    A copy of ``configs`` extended with 'start', 'end' and the accuracies
    under 'lr', 'mnb', 'svc' and 'sgd'.
  """
  X_train,X_test,y_train,y_test = train_test_split(cv, label , test_size=.2, stratify=label, random_state=6174)

  lr, mnb, svc, sgd = load_models(configs)

  # Identity checks (`is None`) are used because `== None` dispatches to __eq__.

  # Logistic Regression
  if lr is None:
    lr = LogisticRegression()
  ac_lr = _fit_and_score(lr, X_train, y_train, X_test, y_test)
  print("LR: ", ac_lr)

  # Multinomial Naive Bayes
  if mnb is None:
    mnb = MultinomialNB()
  ac_mnb = _fit_and_score(mnb, X_train, y_train, X_test, y_test)
  print("MNB: ", ac_mnb)

  # Support Vector Machine
  if svc is None:
    svc = SVC()
  ac_svc = _fit_and_score(svc, X_train, y_train, X_test, y_test)
  print("SVC: ", ac_svc)

  # Stochastic Gradient Descent
  if sgd is None:
    sgd = SGDClassifier()
  ac_sgd = _fit_and_score(sgd, X_train, y_train, X_test, y_test)
  print("SGD: ", ac_sgd)

  d = configs.copy()
  d.update({
      'start':start,
      'end':end,
      'lr':ac_lr,
      'mnb':ac_mnb,
      'svc':ac_svc,
      'sgd':ac_sgd
  })

  save_models(configs, lr, mnb, svc, sgd)

  return d

In [None]:
def start_training(end=0, chunk_size=10000):
  """Train the classifiers chunk by chunk, resuming from the ledger.

  Pipeline: load_and_clean -> create_vectorizer -> for each chunk:
  load_chunks -> normalize_and_vectorize -> train_test_save, appending the
  returned accuracies to 'ledger.csv' after every chunk.

  Args:
    end: Per-class row position at which training stops. Each chunk takes
      ``chunk_size`` positive and ``chunk_size`` negative tweets, so twice
      this many rows are used in total. The default 0 keeps the previous
      hard-coded value, which means no chunk is trained.
    chunk_size: Number of rows taken from each class per chunk.
  """
  config = {
      "sw":True,
      "vsw":True,
      "stemmer":True,
      "tfidf":True
  }

  try:
    ledger = pd.read_csv('ledger.csv')
  except Exception:
    # No usable ledger yet (e.g. first run): start with an empty one
    ledger = pd.DataFrame(columns=['start', 'end', 'sw', 'vsw', 'stemmer', 'tfidf', 'lr', 'mnb', 'svc', 'sgd' ])

  data = load_and_clean('training.1600000.processed.noemoticon.csv', 'latin1')

  create_vectorizer(data, **config)

  # Resume after the last recorded chunk; int() because range() needs an
  # integer and the value comes back from the CSV as a numpy scalar
  start = int(ledger['end'].iloc[-1]) if ledger.shape[0] > 0 else 0

  for i in range(start, end, chunk_size):

    print("Start, End: ", i, i+chunk_size)

    df = load_chunks(data, i, i+chunk_size)

    cv = normalize_and_vectorize(df, **config)
    d = train_test_save(cv, df['label'], i, i+chunk_size, config)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
    ledger = pd.concat([ledger, pd.DataFrame([d])], ignore_index=True)
    # Written after every chunk so an interrupted run can resume
    ledger.to_csv('ledger.csv', index=False)

    print("\n")

In [None]:
def test_on_dataset(start=600_000, end=800_000, data=None):
  """Evaluate the four saved models on a slice of the dataset and print reports.

  For each model this prints the accuracy, the confusion matrix and the
  classification report.

  Args:
    start, end: Per-class row positions of the evaluation slice. These were
      previously ignored in favour of hard-coded literals equal to the defaults.
    data: DataFrame with 'label' and 'message' columns. When omitted, a
      notebook-level ``data`` variable is used if one exists; otherwise the
      training CSV is loaded.

  Raises:
    RuntimeError: If the saved models could not be loaded.
  """
  config = {
      "sw":True,
      "vsw":True,
      "stemmer":True,
      "tfidf":True
  }

  if data is None:
    # Keep working for notebooks that defined a global `data` beforehand
    data = globals().get('data')
  if data is None:
    data = load_and_clean('training.1600000.processed.noemoticon.csv', 'latin1')

  lr, mnb, svc, sgd = load_models(config)
  if any(model is None for model in (lr, mnb, svc, sgd)):
    raise RuntimeError("Saved models could not be loaded; run start_training() first.")

  testdf = load_chunks(data, start, end)
  cv = normalize_and_vectorize(testdf, **config)

  for tag, model in (("LR: ", lr), ("MNB: ", mnb), ("SVC: ", svc), ("SGD: ", sgd)):
    predicted = model.predict(cv)
    print(tag, accuracy_score(testdf.label, predicted))
    print(confusion_matrix(testdf.label, predicted))
    print(classification_report(testdf.label, predicted))


In [None]:
def classify_with_models(text, **kwargs):
  """Print ``text`` followed by each saved model's verdict for it.

  The text is normalized and vectorized with a fixed config (stop words
  removed, stemming, TF-IDF, vectorizer stop words). A prediction of 4 is
  shown as "Happy" and anything else as "Depressed". Extra keyword arguments
  are accepted but not used.
  """
  config = {
      "sw": True,
      "vsw": True,
      "stemmer": True,
      "tfidf": True,
  }

  normalized = normalizer(text, **config)
  features = vectorizer([normalized], **config)

  lr, mnb, svc, sgd = load_models(config)

  print(text)
  for tag, model in (("LR:", lr), ("MNB:", mnb), ("SVC:", svc), ("SGD:", sgd)):
    if model.predict(features)[0] == 4:
      verdict = "Happy"
    else:
      verdict = "Depressed"
    print(tag, verdict, end="\t")
  print('\n')

# Demo with Classification Models

In [None]:
# Uncomment only if you have the dataset downloaded

In [None]:
# data = load_and_clean('training.1600000.processed.noemoticon.csv', 'latin1')
# tweets = data

In [None]:
# depressive_words = ' '.join(list(tweets[tweets['label'] == 0]['message']))
# depressive_wc = WordCloud(width = 512,height = 512, collocations=False, colormap="Blues").generate(depressive_words)
# plt.figure(figsize = (8, 6), facecolor = 'k')
# plt.imshow(depressive_wc)
# plt.axis('off')
# plt.tight_layout(pad = 0)
# plt.show()

In [None]:
# positive_words = ' '.join(list(tweets[tweets['label'] == 4]['message']))
# positive_wc = WordCloud(width = 512,height = 512, collocations=False, colormap="Blues").generate(positive_words)
# plt.figure(figsize = (8, 6), facecolor = 'k')
# plt.imshow(positive_wc)
# plt.axis('off')
# plt.tight_layout(pad = 0)
# plt.show()

## Depressive Tweets

In [None]:
# Sample inputs for the "Depressive Tweets" section.
# Each call prints the text followed by the four models' Happy/Depressed verdicts.
classify_with_models('Lately I have been feeling unsure of myself as a person & an artist')
classify_with_models('Extreme sadness, lack of energy, hopelessness')
classify_with_models('Hi hello depression and anxiety are the worst')
# Contains an @mention, which normalizer() strips before classification
classify_with_models('I am officially done with @kanyewest')
classify_with_models('Feeling down...')
classify_with_models('My depression will not let me work out')
classify_with_models("I wish life could be better")
classify_with_models("I feel like attempting suicide")

In [None]:
# Mixed wording: the positive word "happy" next to "alone" and "lonely"
classify_with_models("I am happy to be alone and lonely")

In [None]:
# Explicitly negative wording, written entirely in lower case
classify_with_models('lately i have been feeling very lonely and depressed')

In [None]:
# Positive-sounding sample inputs (presumably expected to come out as "Happy" - confirm against the output)
classify_with_models('Loving how me and my lovely partner is talking about what we want.')
classify_with_models('Very rewarding when a patient hugs you and tells you they feel great after changing the diet and daily habits')
classify_with_models('Happy Thursday everyone. Thought today was Wednesday so super happy tomorrow is Friday yayyyyy')
classify_with_models('It’s the little things that make me smile. Got our new car today and this arrived with it')

In [None]:
# Negated positive word ("not satisfied").
# NOTE(review): "not" is likely dropped by stop-word removal (sw=True / vsw=True), leaving only "satisfied" - confirm.
classify_with_models('I am not satisfied with their teaching')



---



# Testing and Reports

Run with *399,999* data points.



## Logistic Regression

**Accuracy**:  0.7275868189670475

**Confusion Matrix**:

- | Predicted No | Predicted Yes
--- | --- | ---
Actual No | 136825 | 63174
Actual Yes | 45791 | 154209

![LR Normalized Confusion Matrix](https://drive.google.com/uc?id=1FIjDLMdq1X1dlDeYI-4AK5LuBVIJvH3U)


**Classification Report**:

- | precision | recall | f1-score | support
--- | --- | --- | --- | ---
0 | 0.75 | 0.68 | 0.72 | 199999
4 | 0.71 | 0.77 | 0.74 | 200000
accuracy | | | 0.73 | 399999
macro avg | 0.73 | 0.73 | 0.73 | 399999
weighted avg | 0.73 | 0.73 | 0.73 | 399999



---



## Multinomial Naive Bayes


**Accuracy**:  0.7143142857857144

**Confusion Matrix**:

- | Predicted No | Predicted Yes
--- | --- | ---
Actual No | 145915 | 54084
Actual Yes | 60190 | 139810

![MNB Normalized Confusion Matrix](https://drive.google.com/uc?id=1XcY4eKrPGYVAheO0ZMUQqi-_DiU0Z_6j)

**Classification Report**:

- | precision | recall | f1-score | support
--- | --- | --- | --- | ---
0 | 0.71 | 0.73 | 0.72 | 199999
4 | 0.72 | 0.70 | 0.71 | 200000
accuracy | | | 0.71 | 399999
macro avg | 0.71 | 0.71 | 0.71 | 399999
weighted avg | 0.71 | 0.71 | 0.71 | 399999



---



## Support Vector Classifier


**Accuracy**:  0.7306768266920667

**Confusion Matrix**:

- | Predicted No | Predicted Yes
--- | --- | ---
Actual No | 136128 | 63871
Actual Yes | 43858 | 156142

![SVC Normalized Confusion Matrix](https://drive.google.com/uc?id=171NyDpoH2bkuSXWUjQhph5ENFRS8qxB3)

**Classification Report**:

- | precision | recall | f1-score | support
--- | --- | --- | --- | ---
0 | 0.76 | 0.68 | 0.72 | 199999
4 | 0.71 | 0.78 | 0.74 | 200000
accuracy | | | 0.73 | 399999
macro avg | 0.73 | 0.73 | 0.73 | 399999
weighted avg | 0.73 | 0.73 | 0.73 | 399999



---



## Stochastic Gradient Descent


**Accuracy**:  0.7262618156545392

**Confusion Matrix**:

- | Predicted No | Predicted Yes
--- | --- | ---
Actual No | 133102 | 66897
Actual Yes | 42598 | 157402

![SGD Normalized Confusion Matrix](https://drive.google.com/uc?id=1FM1qY9Ky-UgFxz0GY2nNE26IAWJroitx)

**Classification Report**:

- | precision | recall | f1-score | support
--- | --- | --- | --- | ---
0 | 0.76 | 0.67 | 0.71 | 199999
4 | 0.70 | 0.79 | 0.74 | 200000
accuracy | | | 0.73 | 399999
macro avg | 0.73 | 0.73 | 0.73 | 399999
weighted avg | 0.73 | 0.73 | 0.73 | 399999
