In [72]:
import pandas as pd
import numpy as np
import re
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV



# Importing the NLTK library
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag

In [73]:
!pip install nltk
nltk.download('stopwords')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [74]:
test_path = 'test_twitter_x_test.csv'
train_path = 'training_twitter_x_y_train.csv'

In [75]:
train_data = pd.read_csv(train_path).copy()

In [76]:
train_data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,567900433542488064,negative,Southwest,,ColeyGirouard,,0,"@SouthwestAir I am scheduled for the morning, ...",,2015-02-17 20:16:29 -0800,Washington D.C.,Atlantic Time (Canada)
1,569989168903819264,positive,Southwest,,WalterFaddoul,,0,@SouthwestAir seeing your workers time in and ...,,2015-02-23 14:36:22 -0800,"Indianapolis, Indiana; USA",Central Time (US & Canada)
2,568089179520954368,positive,United,,LocalKyle,,0,@united Flew ORD to Miami and back and had gr...,,2015-02-18 08:46:29 -0800,Illinois,Central Time (US & Canada)
3,568928195581513728,negative,Southwest,,amccarthy19,,0,@SouthwestAir @dultch97 that's horse radish 😤🐴,,2015-02-20 16:20:26 -0800,,Atlantic Time (Canada)
4,568594180014014464,negative,United,,J_Okayy,,0,@united so our flight into ORD was delayed bec...,,2015-02-19 18:13:11 -0800,,Eastern Time (US & Canada)


In [77]:
documents = []
for i in range(train_data.shape[0]) :
    text = train_data.loc[i, 'text']
    url_pattern = r'https?://\S+|www\.\S+|t\.co/\S+'
    email_pattern = r'\b\w+@\w+\.\w+\b'
    username_pattern = r'@\w+'
    emoji_pattern = r'[\U00010000-\U0010FFFF]'

    text_cleaned = re.sub(url_pattern, '', text)
    text_cleaned = re.sub(email_pattern, '', text_cleaned)
    text_cleaned = re.sub(username_pattern, '', text_cleaned)
    text_cleaned = re.sub(emoji_pattern, '', text_cleaned)

    words = re.findall(r'\b[a-zA-Z]+\b', text_cleaned)
    documents.append([words, train_data.loc[i, 'airline_sentiment']])

In [78]:
documents[:5]

[[['I',
   'am',
   'scheduled',
   'for',
   'the',
   'morning',
   'days',
   'after',
   'the',
   'fact',
   'yes',
   'not',
   'sure',
   'why',
   'my',
   'evening',
   'flight',
   'was',
   'the',
   'only',
   'one',
   'Cancelled',
   'Flightled'],
  'negative'],
 [['seeing',
   'your',
   'workers',
   'time',
   'in',
   'and',
   'time',
   'out',
   'going',
   'above',
   'and',
   'beyond',
   'is',
   'why',
   'I',
   'love',
   'flying',
   'with',
   'you',
   'guys',
   'Thank',
   'you'],
  'positive'],
 [['Flew',
   'ORD',
   'to',
   'Miami',
   'and',
   'back',
   'and',
   'had',
   'great',
   'crew',
   'service',
   'on',
   'both',
   'legs',
   'THANKS'],
  'positive'],
 [['that', 's', 'horse', 'radish'], 'negative'],
 [['so',
   'our',
   'flight',
   'into',
   'ORD',
   'was',
   'delayed',
   'because',
   'of',
   'Air',
   'Force',
   'One',
   'but',
   'the',
   'last',
   'flight',
   'to',
   'SBN',
   'is',
   'at',
   'mins',
   'from',
  

In [80]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [81]:
# importing punctuations
import string
punctuations = list(string.punctuation)
stop += punctuations
stop += ['flight','airline','flights','AA', 'aa']

In [82]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [83]:
from nltk import pos_tag

In [84]:
from nltk.corpus import wordnet
def get_simple_pos (tag) :
    if tag.startswith('J') :
        return wordnet.ADJ
    elif tag.startswith('V') :
        return wordnet.VERB
    elif tag.startswith('N') :
        return wordnet.NOUN
    elif tag.startswith('R') :
        return wordnet.ADV
    else :
        return wordnet.NOUN

In [85]:
def clean_document (words) :
    # it should not be a stop word and we have to lemmatize it by getting pos tag
    output_words = []
    for w in words :
        if w.lower() not in stop :
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word)
    return output_words

In [86]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [87]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [88]:
documents = [[clean_document(document), category] for document, category in documents]

In [89]:
all_words = [" ".join(document) for document,category in documents]

In [90]:
all_words[:3]

['schedule morning day fact yes sure even one Cancelled Flightled',
 'see worker time time go beyond love fly guy Thank',
 'Flew ORD Miami back great crew service leg THANKS']

In [91]:
categories = [category for document, category in documents]


In [92]:
from sklearn.model_selection import train_test_split

In [93]:
x_train, x_test, y_train, y_test = train_test_split(all_words, categories, random_state=0)

In [94]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [95]:
count_vec = CountVectorizer(max_features=8000)
data = count_vec.fit_transform(x_train)

In [96]:
count_vec.get_feature_names_out()

array(['aaaand', 'aadvantage', 'aafail', ..., 'zone', 'zoom', 'zurich'],
      dtype=object)

In [97]:
x_train = data.todense()
x_train

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [98]:
x_test = count_vec.transform(x_test).todense()

In [99]:
x_train = np.array(x_train)
x_test = np.array(x_test)

In [100]:
from sklearn.naive_bayes import MultinomialNB

In [101]:
models = {
    'Logistic Regression': (LogisticRegression(max_iter=1000), {
        'C': [0.01, 0.1, 1, 10],
        'penalty': ['l2'],
        'solver': ['liblinear']
    }),
    'Random Forest': (RandomForestClassifier(), {
        'n_estimators': [2, 5, 10, 20, 50],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    }),
    'SVM': (SVC(), {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto']
    }),
    'Naive Bayes': (MultinomialNB(), {
        'alpha': [0.1, 1, 10]
    })
}

In [102]:
best_models = {}
scores = []
import pandas as pd
for model_name, (model, params) in models.items():
    print(f"Tuning {model_name}...")

    grid_search = GridSearchCV(estimator=model, param_grid=params, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(x_train, y_train)
    scores.append({
        'model' : model_name,
        'best_score' : grid_search.best_score_,
        'best_params' : grid_search.best_params_
    })
    best_models[model_name] = grid_search.best_estimator_

Tuning Logistic Regression...
Tuning Random Forest...
Tuning SVM...
Tuning Naive Bayes...


In [103]:
scores

[{'model': 'Logistic Regression',
  'best_score': 0.7770491803278688,
  'best_params': {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}},
 {'model': 'Random Forest',
  'best_score': 0.7605343047965998,
  'best_params': {'max_depth': None,
   'min_samples_split': 5,
   'n_estimators': 50}},
 {'model': 'SVM',
  'best_score': 0.7666059502125075,
  'best_params': {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}},
 {'model': 'Naive Bayes',
  'best_score': 0.7509411050394658,
  'best_params': {'alpha': 1}}]

In [104]:
df = pd.DataFrame(scores)
df

Unnamed: 0,model,best_score,best_params
0,Logistic Regression,0.777049,"{'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}"
1,Random Forest,0.760534,"{'max_depth': None, 'min_samples_split': 5, 'n..."
2,SVM,0.766606,"{'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}"
3,Naive Bayes,0.750941,{'alpha': 1}


In [105]:
# use logistic regression as it has the best score
clf = LogisticRegression(max_iter=1000, C=1, penalty='l2', solver='liblinear')
clf.fit(x_train, y_train)

In [106]:
clf.score(x_test, y_test)

0.775591985428051

In [107]:
import pickle
with open('model.pkl', 'wb') as f :
    pickle.dump(clf, f)