In [None]:
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
!wget --no-check-certificate --content-disposition https://raw.githubusercontent.com/satyajeetkrjha/kaggle-Twitter-US-Airline-Sentiment-/master/Tweets.csv

--2022-04-09 04:02:25--  https://raw.githubusercontent.com/satyajeetkrjha/kaggle-Twitter-US-Airline-Sentiment-/master/Tweets.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3421431 (3.3M) [text/plain]
Saving to: ‘Tweets.csv’


2022-04-09 04:02:25 (252 MB/s) - ‘Tweets.csv’ saved [3421431/3421431]



In [None]:
df = pd.read_csv('Tweets.csv',index_col=None)
df = df[['airline_sentiment','text']]
df.rename({'airline_sentiment': 'label'}, axis=1, inplace=True)
df.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
negative,9178
neutral,3099
positive,2363


In [None]:
df.label = df.label.replace({'positive': 0, 'neutral': 0, 'negative': 1})

In [None]:
df.sample(10)

Unnamed: 0,label,text
3258,1,@united make sure you make Cancelled Flighted ...
12562,1,"@AmericanAir no, you should do something about..."
11117,0,@USAirways thanks to the gate agent in State C...
1741,1,@united pls stop sending texts every 15mins sa...
3282,0,@united Thank you for the Delta transfer. Will...
1337,1,@united Maybe be hiring your own ground staff ...
14130,1,@AmericanAir Trying to rebook a flight with yo...
9062,1,@USAirways I've been on hold to rebook a Cance...
14304,1,@AmericanAir @NY_NJairports AA1224: 45 mins fo...
11748,1,@USAirways still can't get a real person on th...


In [None]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.3)

y_train, y_test = train.label, test.label

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TweetTokenizer
from nltk.stem.porter import PorterStemmer
import string
import re

tokenizer = TweetTokenizer()

def tokenizer_fn(text):
  # remove numbers
  text_nonum = re.sub(r'\d+', '', text)
  # apply Twitter Tokenizer
  tokens = tokenizer.tokenize(text_nonum)
  #remove punctuation
  tokens = list(filter(lambda token: token not in string.punctuation, tokens))

  return tokens

vectorizer = CountVectorizer(tokenizer=tokenizer_fn, stop_words=stopwords.words('english'), lowercase=True, max_df=0.5, min_df=3)

X_train = vectorizer.fit_transform(train.text)
X_test = vectorizer.transform(test.text)
y_train, y_test = train.label, test.label

In [None]:
word_freq = pd.Series(dict(zip(vectorizer.get_feature_names(), X_train.toarray().sum(axis=0))))
word_freq.sort_values(ascending=False)[:20]



@united          2771
flight           2765
@usairways       2115
@americanair     2030
@southwestair    1711
@jetblue         1558
get               963
thanks            741
cancelled         715
...               689
service           685
help              594
time              550
customer          520
hours             497
i'm               496
us                495
hold              448
flights           443
still             433
dtype: int64

**1) train logistic regression with L1 penalty จาก feature ที่ทำไว้ โดยที่จะต้อง tune regularization parameter ของ L1 penalty ด้วย cross validation พอได้ model ที่เทรนแล้วก็ทำ error analysis ดู precision/recall/AUC บน test data ที่เราเตรียมไว้**

In [None]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(C=1, penalty='l1', solver='liblinear',random_state=0)

In [None]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
params = [{"C": [0.5,2,10,50]}]
clf = GridSearchCV(model, params , cv= 3)
clf.fit(X_train, y_train)


GridSearchCV(cv=3,
             estimator=LogisticRegression(C=1, penalty='l1', random_state=0,
                                          solver='liblinear'),
             param_grid=[{'C': [0.5, 2, 10, 50]}])

In [None]:
from sklearn.metrics import classification_report

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
print('AUC Score: {}'.format(roc_auc_score(y_test, clf.decision_function(X_test))))

              precision    recall  f1-score   support

           0       0.76      0.76      0.76      1645
           1       0.85      0.86      0.85      2747

    accuracy                           0.82      4392
   macro avg       0.81      0.81      0.81      4392
weighted avg       0.82      0.82      0.82      4392

AUC Score: 0.8914091415559169


**2) ลอง train random forest model และ tune hyper-parameters แล้วดูว่าสามารถ outperform logistic regression ได้มั้ย**

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

param_grid = { 'max_features': ['sqrt'],'max_depth': [20,30,40, 100, 110],'n_estimators': [150, 200, 250],}
clf = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)
clf.fit(X_train,y_train)

Fitting 3 folds for each of 15 candidates, totalling 45 fits


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [20, 30, 40, 100, 110],
                         'max_features': ['sqrt'],
                         'n_estimators': [150, 200, 250]},
             verbose=2)

In [None]:
from sklearn.metrics import classification_report

y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
print('AUC Score: {}'.format(roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])))

              precision    recall  f1-score   support

           0       0.77      0.66      0.71      1645
           1       0.81      0.89      0.85      2747

    accuracy                           0.80      4392
   macro avg       0.79      0.77      0.78      4392
weighted avg       0.80      0.80      0.80      4392

AUC Score: 0.8682756430612893


**3) ลองเปลี่ยน feature engineering จาก unigram เป็น bigram (CountVecterizer สามารถทำได้ ให้ลองดูจาก documentation ของ sklearn) จากนั้น train logistic regression model บน bigram features และเปลี่ยนเทียบ performance ของ model กับ model ที่ใช้แค่ unigram**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TweetTokenizer
from nltk.stem.porter import PorterStemmer
import string
import re

tokenizer = TweetTokenizer()

def tokenizer_fn(text):
  # remove numbers
  text_nonum = re.sub(r'\d+', '', text)
  # apply Twitter Tokenizer
  tokens = tokenizer.tokenize(text_nonum)
  #remove punctuation
  tokens = list(filter(lambda token: token not in string.punctuation, tokens))

  return tokens

In [None]:
vectorizer2 = CountVectorizer(tokenizer=tokenizer_fn, stop_words=stopwords.words('english'), lowercase=True, max_df=0.5, min_df=3, ngram_range=(2, 2))
X2_train = vectorizer2.fit_transform(train.text)
X2_test = vectorizer2.transform(test.text)
y2_train, y2_test = train.label, test.label

In [None]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(C=1, penalty='l1', solver='liblinear',random_state=0)

In [None]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
params = [{"C": [0.5,2,10,50]}]
clf = GridSearchCV(model, params , cv= 3)
clf.fit(X2_train, y2_train)

GridSearchCV(cv=3,
             estimator=LogisticRegression(C=1, penalty='l1', random_state=0,
                                          solver='liblinear'),
             param_grid=[{'C': [0.5, 2, 10, 50]}])

In [None]:
from sklearn.metrics import classification_report

y2_pred = clf.predict(X2_test)
print(classification_report(y2_test, y2_pred))
print('AUC Score: {}'.format(roc_auc_score(y2_test, clf.decision_function(X2_test))))

              precision    recall  f1-score   support

           0       0.71      0.42      0.53      1645
           1       0.72      0.90      0.80      2747

    accuracy                           0.72      4392
   macro avg       0.71      0.66      0.66      4392
weighted avg       0.71      0.72      0.70      4392

AUC Score: 0.7851345983404941


In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

param_grid = { 'max_features': ['sqrt'],'max_depth': [20,30,40, 100, 110],'n_estimators': [150, 200, 250],}
clf = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)
clf.fit(X2_train,y2_train)

Fitting 3 folds for each of 15 candidates, totalling 45 fits


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [20, 30, 40, 100, 110],
                         'max_features': ['sqrt'],
                         'n_estimators': [150, 200, 250]},
             verbose=2)

In [None]:
from sklearn.metrics import classification_report

y2_pred = clf.predict(X2_test)
print(classification_report(y2_test, y2_pred))
print('AUC Score: {}'.format(roc_auc_score(y2_test, clf.predict_proba(X2_test)[:,1])))

              precision    recall  f1-score   support

           0       0.79      0.24      0.37      1645
           1       0.68      0.96      0.80      2747

    accuracy                           0.69      4392
   macro avg       0.73      0.60      0.58      4392
weighted avg       0.72      0.69      0.64      4392

AUC Score: 0.7426736876813944


**สรุป: หลังเปลี่ยนเป็น bigram พบว่าประสิทธิภาพแย่ลงทั้ง Logistic regression และ RandomForest**