In [1]:
#importing dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import stopwords


In [2]:
# Load the raw tweets from the CSV export into a DataFrame
tweets_df = pd.read_csv("Tweets_source_data.csv")
# Number of rows (tweets) that were loaded
tweets_df.shape[0]

14640

In [None]:
# Pie chart: each airline's share of all tweets, written to the images folder
airline_counts = tweets_df['airline'].value_counts()
# label='' suppresses the y-axis label pandas would otherwise draw beside the pie
airline_ax = airline_counts.plot.pie(autopct='%1.0f%%', radius=1.45, label='', shadow=True)
airline_ax.figure.savefig('../images/airlinestweets.png')

In [None]:
# Pie chart: proportion of each sentiment label across all tweets
sentiment_counts = tweets_df['airline_sentiment'].value_counts()
# label='' suppresses the y-axis label pandas would otherwise draw beside the pie
sentiment_ax = sentiment_counts.plot.pie(autopct='%1.0f%%', radius=1.48, label='', shadow=True)
sentiment_ax.figure.savefig('../images/tweetsenti.png')

In [None]:
# Grouped bar chart: tweet counts per airline, one bar per sentiment label
sentiment_by_airline = (
    tweets_df
    .groupby(['airline', 'airline_sentiment'])['airline_sentiment']
    .count()
    .unstack()  # sentiment labels become columns -> one bar series each
)
airline_bar_ax = sentiment_by_airline.plot.bar(figsize=(12,7), legend=True, title='Tweets by Airlines')
airline_bar_ax.figure.savefig('../images/tweets_by_sent.png')

# Text Processing
- Looking through the different tweets

In [5]:
#processing each individual tweet: remove the leading airline handle, special
#characters, stray single letters and repeated spaces, then lower-case the text
def clean_tweet(text):
    """Return a normalised, lower-cased copy of one raw tweet.

    Steps, in order:
      1. drop a leading @mention (the airline handle), if the tweet opens with one;
      2. replace every non-word character with a space;
      3. remove isolated single ASCII letters (left over from e.g. "it's" -> "it s");
      4. collapse whitespace, strip the ends and lower-case.

    Parameters
    ----------
    text : object
        Raw tweet; converted with ``str()`` so non-string values do not raise.

    Returns
    -------
    str
        The cleaned tweet (may be empty).
    """
    text = str(text)

    # Only strip the first token when it really is a handle. Dropping the first
    # word unconditionally would eat a real word from tweets such as
    # "Thanks @airline ...". Leading punctuation (quotes, '.') before the '@'
    # is tolerated.
    text = re.sub(r'^\W*@\w+', ' ', text)

    # Remove all the special characters
    text = re.sub(r'\W', ' ', text)

    # Remove single letters surrounded by whitespace. The lookahead leaves the
    # trailing space unconsumed so that runs like "x a b y" are fully cleaned.
    text = re.sub(r'\s+[a-zA-Z](?=\s)', ' ', text)

    # Remove a single letter at the very start. '^' must be an anchor here;
    # the escaped form '\^' would only match a literal caret character.
    # This also covers a stray leading 'b', so no separate rule is needed.
    text = re.sub(r'^[a-zA-Z]\s+', ' ', text)

    # split()/join collapses repeated whitespace and trims both ends
    return ' '.join(text.lower().split())


tweets = tweets_df['text']

# Iterate over the values themselves rather than positional labels, so this
# works even if tweets_df does not have a default 0..n-1 index.
processed_tweets = [clean_tweet(raw_tweet) for raw_tweet in tweets]

#add a new column to the df with the clean tweet
tweets_df['tweets'] = processed_tweets


In [None]:
#create a word cloud of the most popular words in the tweets.
def wordcloud(tweets, col, save_path='../images/wordcloud.png'):
    """Draw a word cloud from one text column and save it as an image.

    Parameters
    ----------
    tweets : mapping of column label -> iterable of texts
        Typically a DataFrame; ``tweets[col]`` is iterated to build the corpus.
    col : hashable
        Label of the column holding the text.
    save_path : str, optional
        File the figure is written to. Defaults to the path this notebook
        has always used, so existing calls behave the same.
    """
    # Missing values cannot be joined as text: skip them and coerce the rest
    corpus = " ".join(str(text) for text in tweets[col] if pd.notna(text))

    # Distinct local names: avoid shadowing this function and the nltk import
    cloud_stopwords = set(STOPWORDS)
    cloud = WordCloud(background_color='white', colormap="gnuplot", contour_color='steelblue', max_words=2000, stopwords=cloud_stopwords).generate(corpus)

    plt.figure(figsize=(20,10), facecolor='k')
    plt.imshow(cloud, cmap=plt.cm.gray, interpolation="bilinear")
    plt.axis("off")
    plt.savefig(save_path)


wordcloud(tweets_df,'tweets')

# Creating a Model
- We will build TF-IDF features from at most the 5000 most frequent words, keeping only words that appear in at least 7 tweets and in no more than 80% of all tweets.
- We will use random forest classification 

In [6]:
#creating a vectorizer to be used for classification
# Vocabulary limits: at most 5000 terms, each present in >= 7 tweets (min_df)
# and in <= 80% of tweets (max_df); English stop words are excluded.
vectorizer = TfidfVectorizer(max_features=5000, min_df=7, max_df=0.8, stop_words=stopwords.words('english'))

# Fit on the cleaned-text column instead of `processed_tweets`: that name is
# rebound to the feature matrix below, so reading from it would make this cell
# fail when it is run a second time.
tweet_features = vectorizer.fit_transform(tweets_df['tweets']).toarray()

# Keep the established name bound to the dense TF-IDF matrix for later cells
processed_tweets = tweet_features