In [17]:
#importing dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re


from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [2]:
#read the tweets data
tweets_df = pd.read_csv("Tweets_source_data.csv")
# 
len(tweets_df)

14640

In [None]:
#displays tweets share by airlines
tweets_df.airline.value_counts().plot(kind='pie', autopct='%1.0f%%', radius=1.45, label='',shadow=True)
plt.savefig('../images/airlinestweets.png')

In [None]:
#display count of the different sentiments - negative, neutral, positive
tweets_df.airline_sentiment.value_counts().plot(kind='pie', autopct='%1.0f%%', radius=1.48, label='',shadow=True)
plt.savefig('../images/tweetsenti.png')

In [None]:
#represent sentiments by airlines on a bar chart
tweets_df.groupby(['airline', 'airline_sentiment']).airline_sentiment.count().unstack().plot(kind='bar', figsize=(12,7), legend=True, title='Tweets by Airlines')
plt.savefig('../images/tweets_by_sent.png')
#plt.legend(loc='best')
# sentiment_airline.plot(kind='bar', loc='best')

# sns.catplot(x="airline", y="countval", hue="airline_sentiment", data=sentiment_airline,
#             height=6, kind="bar", palette="muted")

# Text Processing
- Looking through the different tweets

In [5]:
#processing each individual tweet, by removing multiple spaces, single character, airline name
tweets = tweets_df['text']
tweets

processed_tweets = []

for line in range(0, len(tweets)):
#     proc_line = tweets[line].split()[1:]
    # Remove all the special characters
    proc_line = re.sub(r'\W', ' ', str(tweets[line]))

    # remove all single characters
    proc_line= re.sub(r'\s+[a-zA-Z]\s+', ' ', proc_line)

    # Remove single characters from the start
    proc_line = re.sub(r'\^[a-zA-Z]\s+', ' ', proc_line) 

    # Substituting multiple spaces with single space
    proc_line = re.sub(r'\s+', ' ', proc_line, flags=re.I)

    # Removing prefixed 'b'
    proc_line = re.sub(r'^b\s+', '', proc_line)

    # Converting to Lowercase
    proc_line = proc_line.lower()
    
    # remove the name of the airline
    proc_line = ' '.join(proc_line.split()[1:])
    
    # add to the created list
    processed_tweets.append(proc_line)
    

#add a new column to the df with the clean tweet
tweets_df['tweets'] = processed_tweets
#tweets_df.head()


In [None]:
#create a word cloud of the most popular word in the tweets.
def wordcloud(tweets,col):
    stopwords = set(STOPWORDS)
    wordcloud = WordCloud(background_color='white', colormap="gnuplot", contour_color='steelblue', max_words=2000, stopwords=stopwords).generate(" ".join([i for i in tweets[col]]))
    plt.figure( figsize=(20,10), facecolor='k')
    plt.imshow(wordcloud, cmap=plt.cm.gray, interpolation="bilinear")
    plt.axis("off")
    plt.savefig('../images/wordcloud.png')
wordcloud(tweets_df,'tweets')  

# Creating a Model
- We will classify and use only the top 5000 words that occur in 80% of the sentences.
- We will use random forest classification 

In [6]:
#creating a vectorizer to be used for classification
vectorizer = TfidfVectorizer (max_features=5000, min_df=7, max_df=0.8, stop_words=stopwords.words('english'))
processed_tweets = vectorizer.fit_transform(processed_tweets).toarray()

In [10]:
#setting our train and test data
airlines = tweets_df['airline']
X_train, X_test, y_train, y_test = train_test_split(processed_tweets, airlines, test_size=0.2, random_state=0)

In [19]:
#train our model
text_classifier = RandomForestClassifier(n_estimators=200, random_state=0)
text_classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [20]:
#make prediction
predictions = text_classifier.predict(X_test)

In [21]:
#generating reports

print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
print(accuracy_score(y_test, predictions))

[[233  32  16  73 193   0]
 [ 27 189  40  53 150   2]
 [ 51  37 160  59 179   2]
 [ 80  21  27 260 212   0]
 [ 58  36  47  81 497   2]
 [ 12  17  15  10  48   9]]
                precision    recall  f1-score   support

      American       0.51      0.43      0.46       547
         Delta       0.57      0.41      0.48       461
     Southwest       0.52      0.33      0.40       488
    US Airways       0.49      0.43      0.46       600
        United       0.39      0.69      0.50       721
Virgin America       0.60      0.08      0.14       111

      accuracy                           0.46      2928
     macro avg       0.51      0.39      0.41      2928
  weighted avg       0.49      0.46      0.45      2928

0.4603825136612022
