In [65]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

In [71]:
# Load the labelled tweets from train.csv, resolved relative to the notebook's
# working directory (no machine-specific absolute path).
# The displayed frame has the columns id, keyword, location, text and target.
train = pd.read_csv('train.csv')
train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [72]:
# top 10 most significant features:
# tweet length, TTR, number of punctuation marks, number of periods
# avg char per sentence, number adverbs, number of "to", number of verbs,
# number of entities, tweet entropy



In [75]:
# Remove all non-English words from tweets
# (trying to get rid of usernames and junk)
import nltk
# nltk.download('words')  # one-time download of the corpus read on the next line
words = set(nltk.corpus.words.words())

# Rewrite the whole 'text' column in one pass. A token is kept when it is in the
# NLTK word list (case-insensitive) or when it is not purely alphabetic, so
# punctuation and numbers survive while unknown alphabetic tokens are dropped.
# The column is addressed by name rather than by position: the previous
# `train.iloc[i][3]` relied on an integer key being treated as a position on a
# string-labelled Series, which newer pandas deprecates.
train['text'] = [
    " ".join(
        w for w in nltk.wordpunct_tokenize(tweet)
        if w.lower() in words or not w.isalpha()
    )
    for tweet in train['text']
]


In [89]:
# Discard every tweet whose 'text' is the empty string, then renumber the rows.
empty = np.where(train['text'].eq(''))  # 1-tuple holding the positions of blank tweets
print(empty)
train.drop(index=train.index[empty[0]], inplace=True)
print(train.shape)
# Replace the index with a gap-free 0..n-1 counter named 'Index'.
train = train.assign(Index=range(len(train))).set_index('Index')
print(train.iloc[23:25])

(array([], dtype=int64),)
(7610, 5)
       id keyword location                           text  target
Index                                                            
23     34     NaN      NaN         What a wonderful day !       0
24     37     NaN      NaN  No way ... I can ' t eat that       0


In [90]:
# Create Features
# Substrings counted in every tweet. The position of each entry fixes the order
# of the per-tweet count lists stored in specialCharacters.
charsToCheck = ['!', '@', '#', '?', '.', ',', 'http']
vowels = {'a', 'e', 'i', 'o', 'u', 'A', 'E', 'I', 'O', 'U'}

# Each list receives one entry per tweet, in the row order of `train`.
hasLocation = []          # 1 when 'location' is non-null, otherwise 0
tweetNumberOfChars = []   # len(text), whitespace included
specialCharacters = []    # list of counts, one per entry of charsToCheck
numberOfWords = []        # number of whitespace-separated tokens
avgCharsPerWord = []      # mean token length (0.0 when the tweet has no tokens)
numNumericTweet = []      # characters for which str.isnumeric() is true
numLettersTweet = []      # characters for which str.isalpha() is true
numUpperTweet = []        # letters that are uppercase
numVowelsTweet = []       # letters contained in `vowels`
numConsonantsTweet = []   # letters not contained in `vowels`
avgCharsPerSentence = []  # NOTE(review): declared here but never filled in this cell

# Walk the rows by position instead of looking them up by index label, so the
# loop does not depend on the index of `train` being exactly 0..n-1.
for location, text in zip(train['location'], train['text']):
    # `tokens` (not `words`) so the vocabulary set named `words` is not overwritten.
    tokens = text.split()
    tokenLengths = [len(token) for token in tokens]

    # Every non-whitespace character of the tweet, and the alphabetic subset.
    nonSpaceChars = ''.join(tokens)
    letters = [char for char in nonSpaceChars if char.isalpha()]
    vowelCount = sum(char in vowels for char in letters)

    hasLocation.append(0 if pd.isnull(location) else 1)
    tweetNumberOfChars.append(len(text))
    specialCharacters.append([text.count(token) for token in charsToCheck])
    numberOfWords.append(len(tokens))
    # np.mean([]) would yield NaN plus a RuntimeWarning; use 0.0 for token-less tweets.
    avgCharsPerWord.append(np.mean(tokenLengths) if tokenLengths else 0.0)
    numNumericTweet.append(sum(char.isnumeric() for char in nonSpaceChars))
    numLettersTweet.append(len(letters))
    numUpperTweet.append(sum(char.isupper() for char in letters))
    numVowelsTweet.append(vowelCount)
    numConsonantsTweet.append(len(letters) - vowelCount)

    
# specials
# Split the per-tweet count lists in specialCharacters into one list per entry
# of charsToCheck, in the same order as charsToCheck. The unpacking expects
# exactly seven entries and raises ValueError if charsToCheck changes length.
numEx, numAt, numHash, numQ, numPeriod, numComma, numLinks = (
    [tweetCharacters[position] for tweetCharacters in specialCharacters]
    for position in range(len(charsToCheck))
)

# Total punctuation marks per tweet. 'http' marks a link rather than a
# punctuation character, so it is excluded from this total; its count is still
# available separately in numLinks.
numPunc = [
    sum(count for token, count in zip(charsToCheck, tweetCharacters) if token != 'http')
    for tweetCharacters in specialCharacters
]

In [91]:
# create new dataframe to hold features
colNames = ['hasLocation', 'tweetNumberOfChars', 'numberOfWords', 'numEx', 'numAt', 
            'numHash', 'numQ', 'numPeriod', 'numComma', 'numLinks', 'numPunc',
            'avgCharsPerWord', 'numNumericTweet', 'numLettersTweet', 'numUpperTweet',
            'numVowelsTweet', 'numConsonantsTweet']
colValues = [hasLocation, tweetNumberOfChars, numberOfWords, numEx, numAt, numHash,
             numQ, numPeriod, numComma, numLinks, numPunc, avgCharsPerWord, numNumericTweet, 
             numLettersTweet, numUpperTweet, numVowelsTweet, numConsonantsTweet]

# Build the frame in a single step; colNames[i] labels colValues[i].
data = pd.DataFrame(dict(zip(colNames, colValues)))

# Attach the labels by position. The feature lists were produced in the row
# order of `train`, so a positional copy keeps rows and labels paired whatever
# the index of `train` looks like, and a length mismatch raises instead of
# silently introducing NaN through index alignment.
data['target'] = train['target'].to_numpy()

data

Unnamed: 0,hasLocation,tweetNumberOfChars,numberOfWords,numEx,numAt,numHash,numQ,numPeriod,numComma,numLinks,numPunc,avgCharsPerWord,numNumericTweet,numLettersTweet,numUpperTweet,numVowelsTweet,numConsonantsTweet,target
0,0,58,12,0,0,1,0,0,0,0,1,3.916667,0,46,4,21,25,1
1,0,28,6,0,0,0,0,1,0,0,1,3.833333,0,22,3,10,12,1
2,0,95,20,0,0,0,0,1,0,0,1,3.800000,0,73,2,32,41,1
3,0,39,8,0,0,1,0,0,1,0,2,4.000000,5,25,0,14,11,1
4,0,66,15,0,0,2,0,0,0,0,2,3.466667,0,50,2,17,33,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7605,0,68,13,0,0,0,0,1,0,0,1,4.307692,1,50,7,15,35,1
7606,0,100,21,0,2,0,0,2,0,0,4,3.809524,0,75,3,28,47,1
7607,0,60,18,0,0,0,1,3,0,0,4,2.388889,9,23,6,6,17,1
7608,0,109,23,0,0,0,0,2,0,0,2,3.782609,0,82,3,36,46,1


In [92]:
# Vectorize contents of all tweets
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import pandas as pd


def _feature_names(vectorizer):
    """Return the fitted vocabulary of `vectorizer` as a list of terms.

    Newer scikit-learn exposes `get_feature_names_out()` and no longer has
    `get_feature_names()`; older releases only have the latter, so use
    whichever one the installed version provides.
    """
    if hasattr(vectorizer, 'get_feature_names_out'):
        return list(vectorizer.get_feature_names_out())
    return vectorizer.get_feature_names()


countvectorizer = CountVectorizer(analyzer ='word', stop_words='english')
tfidfvectorizer = TfidfVectorizer(analyzer='word',stop_words= 'english')

# Document-term matrices: raw term counts and TF-IDF weights for the same tweets.
count_wm = countvectorizer.fit_transform(train['text'])
tfidf_wm = tfidfvectorizer.fit_transform(train['text'])

#retrieve the terms found in the corpora
count_tokens = _feature_names(countvectorizer)
tfidf_tokens = _feature_names(tfidfvectorizer)

# Dense frames with one row per tweet and one column per term.
# NOTE(review): .toarray() materialises the full matrix; this can be large.
df_countvect = pd.DataFrame(data = count_wm.toarray(),index = list(train.index.values) ,columns = count_tokens)
df_tfidfvect = pd.DataFrame(data = tfidf_wm.toarray(),index = list(train.index.values) ,columns = tfidf_tokens)

print("Count Vectorizer\n")
print(df_countvect)
print("\nTD-IDF Vectorizer\n")
print(df_tfidfvect)

Count Vectorizer

      00  000  0000  007npen6lg  00cy9vxeff  00end  00pm  01  02  0215  ...  \
0      0    0     0           0           0      0     0   0   0     0  ...   
1      0    0     0           0           0      0     0   0   0     0  ...   
2      0    0     0           0           0      0     0   0   0     0  ...   
3      0    1     0           0           0      0     0   0   0     0  ...   
4      0    0     0           0           0      0     0   0   0     0  ...   
...   ..  ...   ...         ...         ...    ...   ...  ..  ..   ...  ...   
7605   0    0     0           0           0      0     0   0   0     0  ...   
7606   0    0     0           0           0      0     0   0   0     0  ...   
7607   0    0     0           0           0      0     0   1   0     0  ...   
7608   0    0     0           0           0      0     0   0   0     0  ...   
7609   0    0     0           0           0      0     0   0   0     0  ...   

      û_1  û_ahhh  û_https  û_one

In [93]:
# Remove entries with junk characters
# (not sure why they show up; possibly mis-decoded characters in the source CSV -- unverified)
def isascii(s):
    """Check if the characters in string s are in ASCII, U+0-U+7F.

    Returns True when every character of `s` has a code point below 0x80
    (an empty string counts as ASCII), False otherwise. The check inspects
    code points directly instead of encoding the string, so text that cannot
    be encoded (e.g. a lone surrogate) yields False rather than raising
    UnicodeEncodeError.
    """
    return all(ord(ch) < 0x80 for ch in s)

# Collect every term (column name) that contains a non-ASCII character, then
# drop them from both frames in one call each. Dropping inside the loop copied
# both wide frames once per offending column.
# Both frames are expected to share the same columns; a name missing from
# df_tfidfvect makes drop() raise KeyError.
non_ascii_cols = [col for col in df_countvect.columns if not isascii(col)]
df_countvect = df_countvect.drop(columns=non_ascii_cols)
df_tfidfvect = df_tfidfvect.drop(columns=non_ascii_cols)

print(df_countvect.shape)
print(df_tfidfvect.shape)

(7610, 11610)
(7610, 11610)


In [94]:
# Save cleaned vectorized tweets to csv
# print(df_tfidfvect.iloc[0])
# df_tfidfvect.to_csv('vectorized_tweets.csv')

In [105]:
# Get TTR (type-token ratio)
# occurances = list()
# for tweet in range(len(df_tfidfvect)):
#     count = 0
#     for word in df_tfidfvect.iloc[tweet]:
#         if word != 0:
#             count+=df_tfidfvect.iloc[tweet]
#     occurances.append(count)

# count = np.sum(df_tfidfvect, axis=1).tolist()

In [106]:
# ttr = []
# for i in range(len(occurances)):
#     if occurances[i] ==0:
#         ttr.append(0)
#         continue
#     ttr.append(occurances[i]/count[i])
# print(ttr)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
# Make heatmap to visualize
import seaborn as sns
# Pairwise correlations between the columns of `data` (the engineered features
# plus 'target'). The name `features` used here before is not defined anywhere
# in this notebook, so the cell raised NameError.
sns.heatmap(data.corr(), cmap='BrBG')

In [11]:
# split to train and test data
# Column position where the inputs stop; the column at this position is used as
# the label (assumes the label is the last column of `data`).
stopIndex = data.shape[1] - 1

# 80% of the rows, drawn with a fixed seed, form the training set; the
# remaining rows form the test set.
trainData = data.sample(frac=0.8, random_state=25)
testData = data.drop(index=trainData.index)

trainArray = trainData.to_numpy()
trainX, trainY = trainArray[:, :stopIndex], trainArray[:, stopIndex]

testArray = testData.to_numpy()
testX, testY = testArray[:, :stopIndex], testArray[:, stopIndex]

In [12]:
# Run random trees
# Seed the forest so the reported accuracy can be reproduced on a re-run
# (same seed value as the train/test split).
model = RandomForestClassifier(random_state=25)
model.fit(trainX, trainY)
# Mean accuracy on the held-out rows.
model.score(testX, testY)

0.6999343401181878

(7613, 21362)
