In [2]:
import pandas as pd
import numpy as np
from textblob import TextBlob
import regex as re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pickle

In [3]:
# Load the training data from the project's Data directory.
df = pd.read_csv(filepath_or_buffer='../Data/train.csv')

In [4]:
# Coerce the text column to Python strings so the regex cleaning below
# can be applied to every row.
text_as_str = df['text'].astype(str)
df['text'] = text_as_str

In [5]:
# Replace every character that is neither a word character nor whitespace
# (punctuation and symbols) with a space, so adjacent tokens stay separated.
# The pattern is a raw string: in a plain literal '\w' and '\s' are invalid
# escape sequences, which Python flags with a DeprecationWarning/SyntaxWarning.
df['text'] = df['text'].map(lambda x: re.sub(r'[^\w\s]', ' ', x))

In [6]:
# Keep ASCII letters only: digits, underscores and any non-ASCII characters
# are each replaced by a single space.
def _letters_only(tweet):
    return re.sub("[^a-zA-Z]", " ", tweet)

df['text'] = df['text'].map(_letters_only)

___

## TextBlob Subjectivity and Polarity

In [18]:
def sentiment(x):
    """Return the TextBlob subjectivity score of ``x``.

    Parameters
    ----------
    x : str
        Text to score.

    Returns
    -------
    float or int
        ``TextBlob(x).sentiment.subjectivity`` on success. If scoring raises
        an ordinary exception, the sentinel ``10`` is returned instead; per
        the TextBlob documentation real scores lie in [0.0, 1.0], so the
        sentinel cannot be confused with a genuine value.

    Notes
    -----
    Only ``Exception`` subclasses are caught, so ``KeyboardInterrupt`` and
    ``SystemExit`` still propagate and a long ``.apply`` can be interrupted.
    """
    try:
        return TextBlob(x).sentiment.subjectivity
    except Exception:
        # 10 is a placeholder marking rows that could not be scored.
        return 10

def polarity(x):
    """Return the TextBlob polarity score of ``x``.

    Parameters
    ----------
    x : str
        Text to score.

    Returns
    -------
    float or int
        ``TextBlob(x).sentiment.polarity`` on success. If scoring raises an
        ordinary exception, the sentinel ``10`` is returned instead; per the
        TextBlob documentation real scores lie in [-1.0, 1.0], so the
        sentinel cannot be confused with a genuine value.

    Notes
    -----
    Only ``Exception`` subclasses are caught, so ``KeyboardInterrupt`` and
    ``SystemExit`` still propagate and a long ``.apply`` can be interrupted.
    """
    try:
        return TextBlob(x).sentiment.polarity
    except Exception:
        # 10 is a placeholder marking rows that could not be scored.
        return 10

In [9]:
# Score every row of the cleaned text with the two TextBlob helpers defined
# above; rows that could not be scored carry the placeholder value 10.
df['subjectivity'] = df['text'].apply(sentiment)
df['polarity'] = df['text'].apply(polarity)

In [10]:
# Replace the index with consecutive integers 0..n-1, then preview the frame.
row_count = len(df)
df.index = range(row_count)
df.head()

Unnamed: 0,id,keyword,location,text,target,subjectivity,polarity
0,1,,,Our Deeds are the Reason of this earthquake M...,1,0.0,0.0
1,4,,,Forest fire near La Ronge Sask Canada,1,0.4,0.1
2,5,,,All residents asked to shelter in place are ...,1,0.3875,-0.01875
3,6,,,people receive wildfires evacuation or...,1,0.0,0.0
4,7,,,Just got sent this photo from Ruby Alaska as ...,1,0.0,0.0


___

## CountVectorization

In [11]:
# Use CountVectorizer to turn the texts into a bag of unigram and bigram
# counts. English stop words are removed and min_df=10 drops rare terms.
cvec = CountVectorizer(
    stop_words='english',
    min_df=10,
    max_df=1.0,
    ngram_range=(1, 2),
)
words_matrix = cvec.fit_transform(df['text'])

In [12]:
# Save the fitted CountVectorizer as a pickle.
filename = '../Assets/cvec.pkl'
# The context manager closes the file handle even if pickling fails, so the
# handle is never leaked and the bytes are flushed to disk.
with open(filename, 'wb') as cvec_file:
    pickle.dump(cvec, cvec_file)

In [13]:
# Create a dataframe of token counts: one row per text, one column per
# vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older versions.
if hasattr(cvec, 'get_feature_names_out'):
    cvec_feature_names = cvec.get_feature_names_out()
else:
    cvec_feature_names = cvec.get_feature_names()
words_df = pd.DataFrame(words_matrix.toarray(), columns=cvec_feature_names)

In [14]:
# Print both shapes: the row counts should agree before the sentiment
# columns are attached to the token-count frame.
sentiment_cols = ['subjectivity', 'polarity']
for frame in (words_df, df[sentiment_cols]):
    print(frame.shape)

(7613, 1707)
(7613, 2)


In [15]:
# Copy the sentiment features and the label from df onto the token-count frame.
for column in ('subjectivity', 'polarity', 'target'):
    words_df[column] = df[column]

In [16]:
# Pairwise correlation between the two sentiment features and the label.
sentiment_vs_target = words_df[['subjectivity', 'polarity', 'target']]
sentiment_vs_target.corr()

Unnamed: 0,subjectivity,polarity,target
subjectivity,1.0,0.176126,-0.079376
polarity,0.176126,1.0,-0.093827
target,-0.079376,-0.093827,1.0


In [17]:
# Preview the first five rows of the combined feature frame.
words_df.head(n=5)

Unnamed: 0,aba,aba woman,abandoned,abc,abc news,ablaze,able,absolutely,accident,according,...,youtube,youtube playlist,youtube video,yr,yr old,yyc,zone,subjectivity,polarity,target
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0.0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.4,0.1,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.3875,-0.01875,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0.0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0.0,1


## TfidfVectorizer

In [27]:
# Use TfidfVectorizer to turn the texts into TF-IDF-weighted unigram and
# bigram features. English stop words are removed and min_df=15 drops rare
# terms.
tfidf = TfidfVectorizer(
    stop_words='english',
    min_df=15,
    max_df=1.0,
    ngram_range=(1, 2),
)
words_matrix1 = tfidf.fit_transform(df['text'])

In [37]:
# Save the fitted TfidfVectorizer as a pickle.
filename = '../Assets/tfidf.pkl'
# The context manager closes the file handle even if pickling fails, so the
# handle is never leaked and the bytes are flushed to disk.
with open(filename, 'wb') as tfidf_file:
    pickle.dump(tfidf, tfidf_file)

In [28]:
# Create a dataframe of TF-IDF weights: one row per text, one column per
# vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older versions.
if hasattr(tfidf, 'get_feature_names_out'):
    tfidf_feature_names = tfidf.get_feature_names_out()
else:
    tfidf_feature_names = tfidf.get_feature_names()
words_df1 = pd.DataFrame(words_matrix1.toarray(), columns=tfidf_feature_names)

In [29]:
# Copy the sentiment features and the label from df onto the TF-IDF frame.
for column in ('subjectivity', 'polarity', 'target'):
    words_df1[column] = df[column]

In [30]:
# Show (rows, columns) of the TF-IDF feature frame.
n_rows, n_cols = words_df1.shape
(n_rows, n_cols)

(7613, 1109)

## Saving to csv

In [32]:
# Write both feature frames to the Data directory without the index column.
csv_targets = (
    (words_df, '../Data/words_df.csv'),
    (words_df1, '../Data/words_df1.csv'),
)
for frame, csv_path in csv_targets:
    frame.to_csv(csv_path, index=False)