In [None]:
import pandas as pd
import random
import numpy as np
import json
import nltk
nltk.download('punkt')
import tqdm

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
#Download Data
!gdown --id 1s-8A8sF7b23Tb9Myoc_3DTl6YXLpL17L  # train data
!gdown --id 1EacvwnOHfwa4FiZy2K8mFpFjmpb4Mt-t #test data(no answer key)
!gdown --id 1YtAHCzeZUXGZQ9cimdkkUq4lUk3ZH-I_  # evaluate.py

Downloading...
From: https://drive.google.com/uc?id=1s-8A8sF7b23Tb9Myoc_3DTl6YXLpL17L
To: /content/lab4_train.csv
100% 331k/331k [00:00<00:00, 47.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=1EacvwnOHfwa4FiZy2K8mFpFjmpb4Mt-t
To: /content/lab4_test.csv
100% 36.2k/36.2k [00:00<00:00, 55.0MB/s]
Downloading...
From: https://drive.google.com/uc?id=1YtAHCzeZUXGZQ9cimdkkUq4lUk3ZH-I_
To: /content/evaluate.py
100% 7.03k/7.03k [00:00<00:00, 13.5MB/s]


In [None]:
#Split Train&Dev
data = pd.read_csv('lab4_train.csv')
# Sequential (unshuffled) split: the first 80% of rows become train, the
# remaining 20% become dev. Positional slicing replaces np.split, which goes
# through a deprecated pandas code path when handed a DataFrame. The .copy()
# makes each split an independent frame, so the columns added to train/dev in
# later cells never write into a view of `data`.
split_at = int(len(data)*0.8)  # train:dev = 80:20
train = data.iloc[:split_at].copy()
dev = data.iloc[split_at:].copy()

In [None]:
# Load the unlabeled test split (downloaded above as "test data (no answer key)").
# NOTE(review): test_data is not referenced by any other cell visible in this section.
test_data = pd.read_csv('lab4_test.csv') 

In [None]:
#Tokenization
# Word-tokenize every dev text with NLTK (tqdm shows progress) and keep the
# resulting token lists in a new 'tokenized' column.
title_list = dev['text'].to_list()
tokenized = [nltk.word_tokenize(sentence) for sentence in tqdm.tqdm(title_list)]
dev['tokenized'] = tokenized

# Same treatment for the train split.
train_title_list = train['text'].to_list()
train_tokenized = [nltk.word_tokenize(sentence) for sentence in tqdm.tqdm(train_title_list)]
train['tokenized'] = train_tokenized

100%|██████████| 632/632 [00:00<00:00, 6851.19it/s]
100%|██████████| 2524/2524 [00:00<00:00, 7840.76it/s]


In [None]:
#Lemmatization
from nltk.stem import WordNetLemmatizer
# Fetch the WordNet corpus before the lemmatizer is first used.
nltk.download('wordnet')
# Single lemmatizer instance shared by every call to stem().
lemma = WordNetLemmatizer()
def stem(li, lemmatizer=None):
  """Lemmatize every token in ``li`` as a noun and return a new list.

  Despite its name, this function performs lemmatization, not stemming.

  Args:
    li: iterable of token strings.
    lemmatizer: optional object exposing ``lemmatize(word, pos=...)``.
      When omitted, the module-level ``lemma`` instance is used, so existing
      single-argument calls (e.g. ``Series.apply(stem)``) behave as before.

  Returns:
    A list with one lemmatized token per input token, in the same order.
  """
  if lemmatizer is None:
    # Resolved at call time so the notebook-level instance is picked up.
    lemmatizer = lemma
  return [lemmatizer.lemmatize(tk, pos="n") for tk in li]
# Lemmatize the token lists of both splits first, then overwrite the
# 'tokenized' columns in place with the lemmatized versions.
dlt=dev['tokenized'].apply(stem)
tlt=train['tokenized'].apply(stem)
dev['tokenized']=dlt
train['tokenized']=tlt

In [None]:
#Bigram Creation
def create_bigram(tokenized):
  """Return the space-joined bigrams of a token list.

  Each adjacent pair of tokens is joined with a single space, in order of
  appearance. Inputs with fewer than two tokens yield an empty list.
  """
  return [first + ' ' + second for first, second in zip(tokenized, tokenized[1:])]
# Add a 'bigram' column (list of adjacent-token pairs) to both splits.
# NOTE(review): the cells visible in this section build features from
# 'tokenized' only; confirm whether 'bigram' is consumed elsewhere.
dev['bigram'] = dev['tokenized'].apply(create_bigram)
train['bigram'] = train['tokenized'].apply(create_bigram)

In [None]:
#New Dataframe for Prediction
# Prediction frame seeded with dev's ids and token lists; predicted labels
# are attached to it by later cells. It keeps dev's row index.
pred_df = dev[['id', 'tokenized']].copy()

In [None]:
#Create Features
def featurize(token_list):
    """Turn a token list into a feature dict.

    Every distinct token becomes a key with value 1 (presence, not count),
    and the key 'length' holds the total number of tokens.
    """
    features = dict.fromkeys(token_list, 1)
    # NOTE(review): a literal token "length" shares this key, so its presence
    # indicator is overwritten by the token count.
    features['length'] = len(token_list)
    return features

In [None]:
#Train Model(Aspect Category)
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
# Fit the feature vocabulary on train only; dev is transformed with this same
# vectorizer at prediction time.
vectorizer = DictVectorizer(sparse=True)
train_features = train['tokenized'].apply(featurize)
feature_vectors = vectorizer.fit_transform(train_features)
# The default max_iter=100 stops lbfgs before it converges on these features
# ("TOTAL NO. of ITERATIONS REACHED LIMIT"), so give the solver more room.
lr_text_classifier = LogisticRegression(max_iter=1000)
lr_text_classifier.fit(feature_vectors, train['aspectCategory'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
#Predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Featurize the dev token lists the same way as train, project them onto the
# already-fitted vocabulary (transform, not fit_transform), and predict the
# aspect category of every dev row.
dev_featurized_list_dicts = pred_df['tokenized'].apply(featurize)
dev_feature_vector = vectorizer.transform(dev_featurized_list_dicts)
predictions = lr_text_classifier.predict(dev_feature_vector)

In [None]:
#Report
# Per-class precision/recall/F1 of the aspect predictions against dev's gold labels.
print (classification_report(dev['aspectCategory'], predictions))

                         precision    recall  f1-score   support

               ambience       0.54      0.45      0.49        71
anecdotes/miscellaneous       0.75      0.75      0.75       194
                   food       0.63      0.77      0.69       203
                  price       0.46      0.30      0.36        60
                service       0.65      0.57      0.61       104

               accuracy                           0.65       632
              macro avg       0.61      0.57      0.58       632
           weighted avg       0.64      0.65      0.64       632



In [None]:
# Save the aspect predictions now: `predictions` is reassigned by the polarity model later.
pred_df['aspectCategory']=predictions

In [None]:
#Train Model(Polarity)
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
# NOTE: this cell rebinds `vectorizer` and `lr_text_classifier`, so after it
# runs those names refer to the polarity model rather than the aspect model.
vectorizer = DictVectorizer(sparse=True)
train_features = train['tokenized'].apply(featurize)
feature_vectors = vectorizer.fit_transform(train_features)
# The default max_iter=100 stops lbfgs before it converges on these features
# ("TOTAL NO. of ITERATIONS REACHED LIMIT"), so give the solver more room.
lr_text_classifier = LogisticRegression(max_iter=1000)
lr_text_classifier.fit(feature_vectors, train['polarity'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
#Predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Featurize the dev token lists, project them onto the vocabulary fitted in
# the polarity training cell, and predict the polarity of every dev row.
# `predictions` is overwritten here with the polarity labels.
dev_featurized_list_dicts = pred_df['tokenized'].apply(featurize)
dev_feature_vector = vectorizer.transform(dev_featurized_list_dicts)
predictions = lr_text_classifier.predict(dev_feature_vector)

In [None]:
#Report
# Per-class precision/recall/F1 of the polarity predictions against dev's gold labels.
print (classification_report(dev['polarity'], predictions))

              precision    recall  f1-score   support

    conflict       0.30      0.28      0.29        25
    negative       0.54      0.40      0.46       150
     neutral       0.51      0.26      0.34        77
    positive       0.74      0.89      0.81       380

    accuracy                           0.67       632
   macro avg       0.52      0.46      0.48       632
weighted avg       0.65      0.67      0.65       632



In [None]:
# Attach the polarity predictions next to the aspect predictions for export.
pred_df['polarity']=predictions

In [None]:
# export to csv & evaluate
pred_df.to_csv('pred.csv', index=None)
!python3 evaluate.py lab4_train.csv pred.csv

=== CLASSIFICATION : ASPECT ===
                class name  precision  recall  F1-score support
0                     food      0.839   0.768     0.802     203
1                    price      0.783   0.300     0.434      60
2                  service      0.855   0.567     0.682     104
3                 ambience      0.780   0.451     0.571      71
4  anecdotes/miscellaneous      0.820   0.753     0.785     194
5                MACRO AVG      0.815   0.568     0.655     632
6                MICRO AVG      0.827   0.650     0.728     632 

=== CLASSIFICATION : SENTIMENT ===
  class name  precision  recall  F1-score support
0   positive      0.752   0.882     0.812     306
1   negative      0.522   0.376     0.437     125
2    neutral      0.606   0.270     0.374      74
3   conflict      0.467   0.292     0.359      24
4  MACRO AVG      0.587   0.455     0.496     529
5  MICRO AVG      0.692   0.650     0.671     529 

=== CLASSIFICATION : OVERALL ===
              precision  recall  F