## Get the Data

In [None]:
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
#%matplotlib inline

from wordcloud import WordCloud, STOPWORDS
import tweepy
import re


import twitterconfig as tc
#from sklearn import datasets

from nltk import ngrams

In [None]:
#!pip install tweepy
#!pip install wordcloud

In [None]:
# Compiled once when the cell runs instead of on every call.
# Each range is limited to a symbol/emoji block. A single span from U+24C2 to
# U+1F251 would also swallow ordinary text such as CJK ideographs, kana and
# Hangul, which all sit between those two code points.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2"             # circled M
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U0000FE00-\U0000FE0F"  # variation selectors (emoji presentation)
    "\U0001F000-\U0001F251"  # mahjong tiles .. enclosed ideographic supplement
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(string):
    """Return ``string`` with emoji and pictographic symbols removed.

    Only characters inside the symbol blocks listed in ``_EMOJI_PATTERN`` are
    deleted; letters of any script (Latin, CJK, Hangul, ...) are preserved.

    Parameters
    ----------
    string : str
        Text to clean.

    Returns
    -------
    str
        The text without the matched characters. Surrounding whitespace is
        left untouched, so removing an emoji between two spaces leaves both
        spaces in place.
    """
    return _EMOJI_PATTERN.sub('', string)



In [None]:
def remove_url(text):
    """Strip every ``http...`` token from ``text``.

    A match starts at the literal ``http`` and runs up to (not including) the
    next whitespace character, so whitespace around the removed token is kept.
    """
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)

In [None]:
# Quick sanity check: the URL in the middle of the sentence should disappear.
sample_sentence = 'this is to test http://localhost:8888/notebooks/ how to remove url '
remove_url(sample_sentence)

### To be able to download twitter data
* Sign in to Twitter and go to: https://developer.twitter.com/

* After logging in to the portal, go to "Applications" and create a new application. It provides the credentials (consumer key and secret, access token and secret) needed to communicate with the Twitter API.

In [None]:
# Credentials for the Twitter API, read from the local `twitterconfig` module
# (imported as `tc` in the first cell) so they are not hard-coded here.
ACCESS_TOKEN = tc.ACCESS_TOKEN
ACCESS_TOKEN_SECRET = tc.ACCESS_TOKEN_SECRET
CONSUMER_KEY = tc.CONSUMER_KEY
CONSUMER_SECRET = tc.CONSUMER_SECRET

auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

# wait_on_rate_limit=True lets tweepy pause instead of failing when the rate
# limit is reached.
api = tweepy.API(auth, wait_on_rate_limit=True)

with open("test_twitter1.txt", "w") as file:
    # Tip: retweets can be excluded altogether by adding -filter:retweets to the query.
    # NOTE(review): newer tweepy releases appear to expose this endpoint as
    # `api.search_tweets` -- confirm against the installed version.
    for tweet_info in tweepy.Cursor(api.search, q='Python', lang='en', tweet_mode='extended').items(10):
        # For a retweet, use the full text of the original tweet.
        if hasattr(tweet_info, 'retweeted_status'):
            full_text = tweet_info.retweeted_status.full_text
        else:
            full_text = tweet_info.full_text

        full_text = remove_emoji(full_text)
        # Drop links, @mentions and the 'RT' marker. Splitting and re-joining
        # also collapses newlines, so each tweet occupies one line of the file.
        no_urls_no_tags = " ".join(
            word for word in full_text.split()
            if 'http' not in word
            and not word.startswith('@')
            and word != 'RT'
        )
        # Save the cleaned tweet, one per line.
        file.write(no_urls_no_tags + "\n")

        # Echo author and cleaned text once per tweet, followed by a separator.
        print(tweet_info._json['user']['screen_name'], ':', no_urls_no_tags)
        print('-----')
   


In [None]:
# Load every cleaned tweet written by the download step as one text blob.
with open('test_twitter1.txt') as tweets_file:
    content = tweets_file.read()

# Word-cloud settings; a custom TTF font can be supplied via `font_path` if wanted.
cloud_options = dict(
    stopwords=STOPWORDS,
    background_color='black',
    width=1800,
    height=1400,
)
wordcloud = WordCloud(**cloud_options).generate(content)

# Render without axes, save a 300-dpi PNG, then display the figure.
plt.imshow(wordcloud)
plt.axis('off')
plt.savefig('my_wordcloud_1.png', dpi=300)
plt.show()

### Read Data

We'll use pandas' **read_csv** to load an existing dataset.


In [None]:
# The CSV has no header row (header=None), so pandas assigns integer column
# labels; the file is decoded as cp1252.
# The larger `training.1600000.processed.noemoticon.csv` file can be loaded
# the same way by swapping the path.
df = pd.read_csv(
    'testdata.manual.2009.06.14.csv',
    header=None,
    encoding="cp1252",
)

In [None]:
# The file was read without a header row, so name the six columns explicitly.
column_names = ['polarity', 'tweet ID', 'date', 'query', 'username', 'tweet']
df.columns = column_names
df.head()

### We are interested in two columns only: 'tweet' and 'polarity'

In [None]:
# Keep only the text and its label. `.copy()` makes the result an independent
# frame, so adding a column to it later (the 'length' feature) does not raise
# pandas' SettingWithCopyWarning.
df = df[['tweet', 'polarity']].copy()
df.head()

In [None]:
# Rename the two remaining columns positionally ('tweet' -> 'Text',
# 'polarity' -> 'Category'); the later cells refer to these generic names.
df.columns = ['Text', 'Category']

## Exploratory Data Analysis

In [None]:
# Preview the first rows after the rename to 'Text' / 'Category'.
df.head()

In [None]:
# Number of rows for each distinct Category value.
df['Category'].value_counts()

In [None]:
# Summary statistics for the frame (by default describe() reports on the
# numeric columns when the frame has any).
df.describe()

Let's use **groupby** to run **describe** separately for each label. This way we can begin to think about the features that separate the different categories.

In [None]:
# Same summary statistics, computed separately for each Category value.
df.groupby('Category').describe()

As we continue our analysis we want to start thinking about the features we are going to be using. This goes along with the general idea of [feature engineering](https://en.wikipedia.org/wiki/Feature_engineering). The better your domain knowledge on the data, the better your ability to engineer more features from it. Feature engineering is a very large part of text classification in general. I encourage you to read up on the topic!

Let's make a new column to detect how long each text entry is!

In [None]:
# 'length' holds the number of characters in each text entry.
df['length'] = [len(entry) for entry in df['Text']]
df.head()

In [None]:
# Show the full text of the row labelled 321.
df.loc[321, 'Text']

### Some Data Visualization

#### Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

# NOTE(review): `MultinomialNB` and `text_tfidf` are not defined in the cells
# visible here; presumably they come from an earlier cell -- confirm that the
# notebook still runs after Restart & Run All.
clf = MultinomialNB()

# 8-fold cross-validation; report the mean score and twice its standard deviation.
scores = cross_val_score(clf, text_tfidf, df['Category'], cv=8)
mean_score = scores.mean()
score_spread = scores.std() * 2
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_score, score_spread))

## Training a RandomForest model

In [None]:
from sklearn.ensemble import RandomForestClassifier

#### Cross Validation

In [None]:
# Seed the forest so the reported score is reproducible between runs; 11 is the
# same seed the notebook uses for its train/test split.
clf = RandomForestClassifier(random_state=11)

# 8-fold cross-validation; report the mean score and twice its standard deviation.
scores = cross_val_score(clf, text_tfidf, df['Category'], cv=8)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

## Plot the Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import train_test_split

# NOTE(review): `MultinomialNB` and `text_tfidf` are not defined in the cells
# visible here; presumably they come from an earlier cell -- confirm.

# The measured duration covers both the train/test split and the model fit.
start = time.time()
clf = MultinomialNB()
X_train, X_test, y_train, y_test = train_test_split(text_tfidf, df['Category'], test_size=0.2, random_state=11)
clf.fit(X_train, y_train)
end = time.time()

print("Accuracy: " + str(clf.score(X_test, y_test)) + ", Time duration: " + str(end - start))

y_pred = clf.predict(X_test)

# Use one explicit, sorted label list for both the matrix and the tick labels.
# Series.unique() returns labels in order of appearance, which need not match
# the sorted order confusion_matrix uses by default, so the axes could be
# mislabelled. Passing `labels=` also keeps the matrix shape fixed even if a
# class is missing from the test split.
class_labels = sorted(df['Category'].unique())
conf_mat = confusion_matrix(y_test, y_pred, labels=class_labels)

# Plot the confusion matrix (rows = actual class, columns = predicted class).
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(conf_mat, annot=True, cmap="Set3", fmt="d",
            xticklabels=class_labels, yticklabels=class_labels, ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()
