# Comet Test Notebook 

In [42]:
from comet_ml import Experiment
experiment = Experiment(api_key="zNkJjcVKOMD5gKd05z6CwT4OD",
                        project_name="team-rm5-sigmoidfreuds", workspace="lizette95")

COMET INFO: old comet version (3.1.11) detected. current: 3.1.12 please update your comet lib with command: `pip install --no-cache-dir --upgrade comet_ml`
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/lizette95/team-rm5-sigmoidfreuds/8982e217023d48a7bcef986270a9e081



In [43]:
# Ignore warnings
import warnings
warnings.simplefilter(action='ignore')

# Install Prerequisites
# import sys
# !{sys.executable} -m pip install wordcloud comet_ml scikit-learn scikit-plot
# nltk.download('vader_lexicon')

# Exploratory Data Analysis
import re
import time
import nltk
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
from textblob import TextBlob
import matplotlib.pyplot as plt

# Data Preprocessing
import string
from bs4 import BeautifulSoup
from collections import Counter
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Modelling
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Performance Evaluation
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import GridSearchCV
from scikitplot.metrics import plot_roc, plot_confusion_matrix
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix, precision_score, recall_score

# Display
%matplotlib inline
sns.set(font_scale=1)
sns.set_style("white")

In [44]:
train_data = pd.read_csv('Data/train.csv')

In [45]:
def clean(df):
    df['token'] = df['message'].apply(TweetTokenizer().tokenize) ## first we tokenize
    df['punc'] = df['token'].apply(lambda x : [i for i in x if i not in list(string.punctuation)]) ## remove punctuations
    df['dig'] = df['punc'].apply(lambda x: [i for i in x if i not in list(string.digits)]) ## remove digits
    df['final'] = df['dig'].apply(lambda x: [i for i in x if len(i) > 1]) ## remove all words with only 1 character
    return df['final']

train_data['final'] = clean(train_data)

In [46]:
def get_part_of_speech(word):
    probable_part_of_speech = wordnet.synsets(word) ## finding word that is most similar (synonyms) for semantic reasoning
    pos_counts = Counter() # instantiating our counter class
    
    ## finding part of speech of word if part of speech is either noun, verb, adjective etc and add it up in a list
    pos_counts["n"] = len(  [ item for item in probable_part_of_speech if item.pos()=="n"]  )
    pos_counts["v"] = len(  [ item for item in probable_part_of_speech if item.pos()=="v"]  )
    pos_counts["a"] = len(  [ item for item in probable_part_of_speech if item.pos()=="a"]  )
    pos_counts["r"] = len(  [ item for item in probable_part_of_speech if item.pos()=="r"]  )
    most_likely_part_of_speech = pos_counts.most_common(1)[0][0] ## will extract the most likely part of speech from the list
    return most_likely_part_of_speech

normalizer = WordNetLemmatizer()
def lemmatise_words(df):
    df['lemma'] = df['final'].apply(lambda x: [normalizer.lemmatize(token, get_part_of_speech(token)) for token in x]) ## lemmatize by way of applying part of speech
    return df['lemma']

train_data['lemma'] = lemmatise_words(train_data)

In [47]:
X = train_data['lemma']
y = train_data['sentiment']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state = 42)

In [48]:
X_train = list(X_train.apply(' '.join))
X_val = list(X_val.apply(' '.join))
vectorizer = TfidfVectorizer(sublinear_tf=True, smooth_idf = True, max_df = 0.3, min_df = 5, token_pattern = r'\w{1,}', strip_accents = 'ascii', ngram_range = (1, 5))
vectorizer.fit(X_train)
X_train = vectorizer.transform(X_train)
X_val = vectorizer.transform(X_val)

### Logistic Regression

In [49]:
# logreg = LogisticRegression(n_jobs=1, C=1e5)
# logreg.fit(X_train, y_train)
# y_pred = logreg.predict(X_val)
# print(classification_report(y_val, y_pred))

In [50]:
# print("\nResults\nConfusion matrix \n {}".format(confusion_matrix(y_val, y_pred)))
# f1 = f1_score(y_val, y_pred,average="macro")
# precision = precision_score(y_val, y_pred,average="macro")
# recall = recall_score(y_val, y_pred,average="macro")
# param_grid = {'penalty': ['l1','l2'], 'C': [1,100,1e5],'multi_class' : ['auto', 'ovr', 'multinomial']}
# params = {"random_state": 42,
#           "model_type": "logreg",
#           "scaler": "standard scaler",
#           "param_grid": "str(param_grid)",
#           "stratify": True,
#           }
# metrics = {"f1": f1,
#            "recall": recall,
#            "precision": precision
#            }

In [51]:
# experiment.log_parameters(params)
# experiment.log_metrics(metrics)

### Linear SVC

In [52]:
linsvc = LinearSVC()
linsvc .fit(X_train, y_train)
y_pred = logreg.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

          -1       0.65      0.55      0.59       126
           0       0.50      0.51      0.50       224
           1       0.81      0.83      0.82       895
           2       0.78      0.76      0.77       337

    accuracy                           0.75      1582
   macro avg       0.68      0.66      0.67      1582
weighted avg       0.75      0.75      0.75      1582



In [53]:
print("\nResults\nConfusion matrix \n {}".format(confusion_matrix(y_val, y_pred)))
f1 = f1_score(y_val, y_pred,average="macro")
precision = precision_score(y_val, y_pred,average="macro")
recall = recall_score(y_val, y_pred,average="macro")
param_grid = {'penalty': ['l1','l2'], 'C': [0.1,1,10,100,1000],'multi_class' : ['crammer_singer', 'ovr']}
params = {"random_state": 42,
          "model_type": "linsvc",
          "scaler": "standard scaler",
          "param_grid": "str(param_grid)",
          "stratify": True,
          }
metrics = {"f1": f1,
           "recall": recall,
           "precision": precision
           }


Results
Confusion matrix 
 [[ 69  23  31   3]
 [ 14 115  78  17]
 [ 21  77 745  52]
 [  2  17  63 255]]


In [54]:
experiment.log_parameters(params)
experiment.log_metrics(metrics)

In [55]:
experiment.end()

COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/lizette95/team-rm5-sigmoidfreuds/8982e217023d48a7bcef986270a9e081
COMET INFO:   Metrics:
COMET INFO:     f1        : 0.6723954115038295
COMET INFO:     precision : 0.6847203520316227
COMET INFO:     recall    : 0.6625226743155694
COMET INFO:   Parameters:
COMET INFO:     model_type   : linsvc
COMET INFO:     param_grid   : str(param_grid)
COMET INFO:     random_state : 42
COMET INFO:     scaler       : standard scaler
COMET INFO:     stratify     : True
COMET INFO:   Uploads:
COMET INFO:     code                : 1 (26 KB)
COMET INFO:     environment details : 1
COMET INFO:     filename            : 1
COMET INFO:     installed packages  : 1
COMET INFO:     notebook            : 1
COMET INFO: ---------------------------
COMET INFO: Uploading stats 

In [56]:
experiment.display()