<a href="https://colab.research.google.com/github/mryf323/ml_final_project/blob/main/phase1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [81]:
import itertools
import pandas as pd
from google_drive_downloader import GoogleDriveDownloader as gdd
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split


import advanced_processor_chain_factory
import simple_processor_chain_factory

In [57]:
gdd.download_file_from_google_drive(file_id='15JJ6ZysFM57tlUjXo2nHVhkGwePbVMVV',dest_path='./dataset.csv')

In [58]:
dataset = pd.read_csv('./dataset.csv')
dataset['sentiment'] = dataset['sentiment'].replace(['negative', 'positive'] , [0, 1])
dataset.head()

Unnamed: 0,comment,sentiment
0,"Oh my god, it just doesn't get any worse than ...",0
1,If you're a layman interested in quantum theor...,0
2,It's amazing that this no talent actor Chapa g...,0
3,This must be one of the most overrated Spanish...,0
4,Some critics have compared Chop Shop with the ...,1


In [59]:
def analysis(labels, predictions):
    #print("Report: Classification\n", classification_report(labels, predictions, target_names=["positive", "negative"]))
    #print("Matrix: Confusion\n", confusion_matrix(labels, predictions))
    print("Accuracy:\n", accuracy_score(labels, predictions))

In [60]:
def evaluate_models_with_data(models, X_train, X_test, Y_train, Y_test):
    for name, model in models.items():
        print(f'------Evaluating {name}------')
        model.fit(X_train, Y_train)
        pred = model.predict(X_test)
        analysis(Y_test, pred)

In [61]:
models = {'logistic regression' : LogisticRegression(class_weight = 'balanced'),
          'svm' : svm.SVC(),
          'knn' : KNeighborsClassifier(n_neighbors=8)
         }

In [62]:
def prepare_data(processor_chain = None, debug = False, debug_data_size = 4000):
    X , Y = dataset['comment'], dataset['sentiment']
    if debug:
        X , Y = X[:debug_data_size], Y[:debug_data_size]
    if processor_chain:
        X = X.apply(processor_chain.process)
    vectorizer = CountVectorizer(max_features = 2000)
    X = vectorizer.fit_transform(X)
    return train_test_split(X,Y)

In [63]:
evaluate_models_with_data(models, *prepare_data(debug = True))

------Evaluating logistic regression------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy:
 0.837
------Evaluating svm------
Accuracy:
 0.778
------Evaluating knn------
Accuracy:
 0.628


In [64]:
evaluate_models_with_data(models, *prepare_data(processor_chain=simple_processor_chain_factory.create(), debug=True))

------Evaluating logistic regression------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy:
 0.837
------Evaluating svm------
Accuracy:
 0.804
------Evaluating knn------
Accuracy:
 0.614


In [65]:
evaluate_models_with_data(models, *prepare_data(processor_chain=advanced_processor_chain_factory.create('lem'), debug=True))

------Evaluating logistic regression------
Accuracy:
 0.822
------Evaluating svm------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy:
 0.827
------Evaluating knn------
Accuracy:
 0.643


In [66]:
evaluate_models_with_data(models, *prepare_data(processor_chain=advanced_processor_chain_factory.create('stem'), debug=True))

------Evaluating logistic regression------
Accuracy:
 0.836
------Evaluating svm------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy:
 0.814
------Evaluating knn------
Accuracy:
 0.648


In [96]:
class Word2VecDataProvider:
  
  def __init__(self, processor_chain, debug = False, debug_data_size = 4000):
    X , Y = dataset['comment'], dataset['sentiment']
    if debug:
      X , Y = X[:debug_data_size] , Y[:debug_data_size]
    sentences = X.apply(nltk.sent_tokenize)
    sentences = sentences.apply(lambda com: 
                                [nltk.word_tokenize(processor_chain.process(s)) 
                                for s in com])
    self.data =  list(itertools.chain.from_iterable(sentences.to_list()))    

In [98]:
from gensim.models import word2vec

class Word2Vec: 
  
  def __init__(self, num_features=250, min_count=40,workers=4,
               window=10,sample=0.001):
    
    self.num_features=num_features
    self.min_count=min_count
    self.workers=workers
    self.window=window
    self.sample=sample
    
  
  def fit(data):
    self.model = word2vec.Word2Vec(data, workers = self.workers, 
                            size = self.num_features, min_count = self.min_count,
                            window = self.window, sample = self.sample)
    self.model.init_sims(replace = True)


  def predict(self,comment):
    result = np.zeros((self.num_features,), dtype = "float32")
    word_index = set(self.model.wv.index2word)
    nword = 0
    for word in comment:
        if word in word_index:
            nword += 1
            result = np.add(result, self.model[word])
    return np.divide(featureVec, nword)    

  

In [101]:
processor_chain = simple_processor_chain_factory.create()
word2vec_data = Word2VecDataProvider(processor_chain=processor_chain, debug=True).data
w2v_model = Word2Vec()
w2v_model.fit(word2vec_data)

def w2v_convertor(comment):
  return w2v_model.predict(nltk.word_tokenize(processor_chain.process(comment)))



  """Entry point for launching an IPython kernel.


[('awful', 0.923717737197876),
 ('terrible', 0.8872237801551819),
 ('lame', 0.8860093355178833),
 ('horrible', 0.8857911229133606),
 ('overall', 0.8806014060974121),
 ('predictable', 0.872204601764679),
 ('writing', 0.8648942708969116),
 ('average', 0.8497321605682373),
 ('scary', 0.84068363904953),
 ('totally', 0.8344157338142395)]