In [15]:
import os
import warnings
import sys

import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
import seaborn as sns

import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mw21807\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
# Load the tweet dataset from "tweets.csv" (path is relative to the
# notebook's working directory).
data = pd.read_csv("tweets.csv")
# Preview the first rows to check the columns that were read.
data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [17]:
# Select model inputs and targets by column name rather than by position, so
# the cell keeps working if the CSV's column order changes.
# "text" is the raw tweet body; "airline_sentiment" is the
# negative / neutral / positive label.
features = data["text"].values
labels = data["airline_sentiment"].values

In [18]:
def _clean_tweet(text):
    """Normalise one raw text: strip punctuation and stray single letters, lower-case."""
    # Replace every non-word character (punctuation, '@', '#', quotes...) by a space.
    text = re.sub(r'\W', ' ', str(text))

    # Drop stand-alone single letters. The lookahead leaves the following
    # whitespace unconsumed, so runs such as "it s b great" lose both letters.
    text = re.sub(r'\s+[a-zA-Z](?=\s|$)', ' ', text)

    # Drop a single letter at the very start of the text. The caret must be
    # unescaped to act as an anchor. This also covers a leading 'b' token.
    text = re.sub(r'^[a-zA-Z]\s+', ' ', text)

    # Collapse whitespace runs left behind by the substitutions above.
    text = re.sub(r'\s+', ' ', text)

    return text.lower().strip()


def clean_n_vect(feat):
    """Clean raw texts and turn them into a dense TF-IDF feature matrix.

    Every item of ``feat`` is passed through ``str()`` and cleaned: non-word
    characters become spaces, stand-alone single letters are removed,
    whitespace is collapsed and the text is lower-cased. The cleaned texts are
    then vectorised with a ``TfidfVectorizer`` (``max_features=2300``,
    ``min_df=7``, ``max_df=0.8``, NLTK English stop words).

    Parameters
    ----------
    feat : iterable
        Raw texts (e.g. a list or a 1-D array). Non-string items are
        converted with ``str()``.

    Returns
    -------
    The dense array produced by ``fit_transform(...).toarray()``.

    Notes
    -----
    The vectorizer is fitted on everything passed in. If this is called on
    the full dataset before a train/test split, the vocabulary and IDF
    weights are learnt from the test rows as well, which leaks information
    into training.
    """
    cleaned_texts = [_clean_tweet(item) for item in feat]

    vectorizer = TfidfVectorizer(
        max_features=2300,
        min_df=7,
        max_df=0.8,
        stop_words=stopwords.words('english'),
    )
    processed_feats = vectorizer.fit_transform(cleaned_texts).toarray()
    # Kept from the original cell: reports the container type of the result.
    print(type(processed_feats))
    return processed_feats

In [19]:
# Vectorise every tweet, then hold out 20% of the rows for evaluation.
# Note: the TF-IDF statistics are computed on all rows before the split.
processed_features = clean_n_vect(features)
X_train, X_test, y_train, y_test = train_test_split(
    processed_features,
    labels,
    test_size=0.2,
    random_state=0,
)

<class 'numpy.ndarray'>


In [28]:
# Display the training labels returned by train_test_split to check that they
# are the sentiment strings.
y_train

array(['positive', 'negative', 'negative', ..., 'negative', 'negative',
       'negative'], dtype=object)

In [25]:
def eval_metrics(y_test, predictions):
    """Score ``predictions`` against the true labels ``y_test``.

    Returns a 3-tuple, in this order: the result of ``confusion_matrix``,
    the result of ``classification_report`` and the result of
    ``accuracy_score``, each called as ``metric(y_test, predictions)``.
    """
    scorers = (confusion_matrix, classification_report, accuracy_score)
    return tuple(score(y_test, predictions) for score in scorers)

In [23]:
def train(in_n_est=None):
    """Fit a random forest on the train split and print its test-set metrics.

    Uses the notebook-level variables ``X_train``, ``y_train``, ``X_test``
    and ``y_test``, and the ``eval_metrics`` helper.

    Parameters
    ----------
    in_n_est : int or int-convertible, optional
        Number of trees in the forest. Any value accepted by ``int()``;
        when omitted or ``None`` the forest is built with 200 trees.

    Returns
    -------
    None
        The confusion matrix, classification report and accuracy are printed.
    """
    # Check for None before converting: int(None) raises, and int() never
    # returns None, so the default has to be decided on the raw argument.
    n_est = 200 if in_n_est is None else int(in_n_est)

    text_classifier = RandomForestClassifier(n_estimators=n_est, random_state=0)
    text_classifier.fit(X_train, y_train)

    predictions = text_classifier.predict(X_test)

    confusionMatrix, classificationReport, accScore = eval_metrics(y_test, predictions)

    # n_est is an integer count, so format it with %d rather than %f.
    print("Random Forest Classifier model (n_estimators=%d):" % n_est)
    print(confusionMatrix)
    print(classificationReport)
    print("  Accuracy: %s" % accScore)

In [26]:
# Baseline run: forest of 200 trees.
train(in_n_est=200)

Random Forest Classifier model (n_estimators=200.000000):
[[1726  108   36]
 [ 332  243   39]
 [ 141   60  243]]
              precision    recall  f1-score   support

    negative       0.78      0.92      0.85      1870
     neutral       0.59      0.40      0.47       614
    positive       0.76      0.55      0.64       444

    accuracy                           0.76      2928
   macro avg       0.71      0.62      0.65      2928
weighted avg       0.74      0.76      0.74      2928

  Accuracy: 0.755464480874317


In [29]:
# Smaller forest (50 trees) for comparison with the 200-tree run.
train(in_n_est=50)

Random Forest Classifier model (n_estimators=50.000000):
[[1711  119   40]
 [ 327  250   37]
 [ 149   63  232]]
              precision    recall  f1-score   support

    negative       0.78      0.91      0.84      1870
     neutral       0.58      0.41      0.48       614
    positive       0.75      0.52      0.62       444

    accuracy                           0.75      2928
   macro avg       0.70      0.61      0.65      2928
weighted avg       0.73      0.75      0.73      2928

  Accuracy: 0.7489754098360656


In [30]:
# Larger forest (500 trees) for comparison with the 200-tree run.
train(in_n_est=500)

Random Forest Classifier model (n_estimators=500.000000):
[[1722  109   39]
 [ 325  247   42]
 [ 135   60  249]]
              precision    recall  f1-score   support

    negative       0.79      0.92      0.85      1870
     neutral       0.59      0.40      0.48       614
    positive       0.75      0.56      0.64       444

    accuracy                           0.76      2928
   macro avg       0.71      0.63      0.66      2928
weighted avg       0.74      0.76      0.74      2928

  Accuracy: 0.7575136612021858
