In [1]:
# IMPORTS

import pandas as pd
import re


In [13]:
# LOAD DATA INTO TRAINING SET
training_set = pd.read_csv('train', delimiter=',',
                           header=None, names=['id', 'keyword', 'location', 'text', 'target'])

training_set.head()

Unnamed: 0,id,location,text,target
0,1,Chicago,the earthquake shook the ocean,1
1,2,Japan,an earthquake is coming,0
2,3,London,what a nice tsunami!,0
3,4,Tokyo,the earthquake grew more intense,1
4,5,Africa,the city was destroyed by an earthquake,1


In [14]:
# FUNCTION TO CLEAN TEXT IN A TWEET

def clean(txt):
    # Remove special characters and convert to lowercase
    return re.sub('\W', ' ', txt).lower()


In [16]:
#DROP ANY COLUMNS THAT WE DON'T NEED
training_set.drop(['id', 'location'], axis=1, inplace=True)


In [17]:
# CLEAN DATA

training_set.text = training_set.text.apply(clean)
training_set.to_csv('cleandata', index=False)

training_set.head()


Unnamed: 0,text,target
0,the earthquake shook the ocean,1
1,an earthquake is coming,0
2,what a nice tsunami,0
3,the earthquake grew more intense,1
4,the city was destroyed by an earthquake,1


In [21]:
# COUNT WORD OCCURRENCES IN FAKE AND REAL TWEETS

table = {
    0: {},
    1: {}
}

# For each row in training set
for row in training_set.itertuples():
    # Split tweet string into an array of words
    words = row.text.split()

    # For each word in tweet
    for word in words:
        # Add word to occurrence table if not exist
        # Default value is 1 to avoid "the 0 problem" (Laplace smoothing)
        if word not in table[0]:
            table[0][word] = 1
            table[1][word] = 1

        # Increment word occurrence by target
        table[row.target][word] += 1

table = pd.DataFrame(table)
table


Unnamed: 0,0,1
the,1,6
earthquake,2,4
shook,1,2
ocean,1,2
an,2,2
is,2,1
coming,2,1
what,2,1
a,2,2
nice,2,1


In [22]:
# CALCULATE CONSTANTS

# Probability that a tweet is fake or real disaster
p = training_set.target.value_counts(normalize=True).to_dict()

# Word count in fake and real tweets
n = table.sum().to_dict()

pd.DataFrame({
    'words': n,
    'probability': p
}).head()


Unnamed: 0,words,probability
0,30,0.333333
1,46,0.666667


In [28]:
# CALCULATE WORD OCCURRENCE PROBABILITY IN FAKE AND REAL TWEETS DISASTER

p_table = pd.DataFrame({
  0: table[0].apply(lambda x: x / n[0]),
  1: table[1].apply(lambda x: x / n[1])
})

p_table.head()


Unnamed: 0,0,1
our,0.000663,0.0005
deeds,2.1e-05,2.5e-05
are,0.002547,0.002101
the,0.020482,0.017068
reason,0.00015,0.0001


In [29]:
# CLASSIFY A NEW TWEETS

def is_fake(sms):
    words = clean(sms).split()

    p_fake_given_message = p[0]
    p_real_given_message = p[1]

    for word in words:
        if word not in p_table[0]:
            continue
        p_fake_given_message *= p_table[0][word]
        p_real_given_message *= p_table[1][word]
    return p_fake_given_message > p_real_given_message

In [45]:
#LOADING TESTING DATA IN testing.csv
testing_set = pd.read_csv('test', delimiter=',',
                           header=None, names=['id', 'keyword', 'location', 'text'])

testing_set.head()

Unnamed: 0,id,keyword,location,text
0,id,keyword,location,text
1,0,,,Just happened a terrible car crash
2,2,,,"Heard about #earthquake is different cities, s..."
3,3,,,"there is a forest fire at spot pond, geese are..."
4,9,,,Apocalypse lighting. #Spokane #wildfires


In [46]:
#DROP ANY COLUMS THAT WE DON'T NEED
testing_set.drop(['keyword', 'location'], axis=1, inplace=True)
testing_set.to_csv('TestData', index=False)
testing_set.head()

Unnamed: 0,id,text
0,id,text
1,0,Just happened a terrible car crash
2,2,"Heard about #earthquake is different cities, s..."
3,3,"there is a forest fire at spot pond, geese are..."
4,9,Apocalypse lighting. #Spokane #wildfires


In [47]:
#USING sample_submisstion as a result csv file
sample_submission = testing_set
sample_submission.head()

Unnamed: 0,id,text
0,id,text
1,0,Just happened a terrible car crash
2,2,"Heard about #earthquake is different cities, s..."
3,3,"there is a forest fire at spot pond, geese are..."
4,9,Apocalypse lighting. #Spokane #wildfires


In [48]:
# CLEAN DATA
sample_submission.text = sample_submission.text.apply(clean)
sample_submission.to_csv('TestData', index=False)

sample_submission.head()

Unnamed: 0,id,text
0,id,text
1,0,just happened a terrible car crash
2,2,heard about earthquake is different cities s...
3,3,there is a forest fire at spot pond geese are...
4,9,apocalypse lighting spokane wildfires


In [49]:
# CLASSIFY A CSV FILE WITH MULTILINE OF TWEETS
sample_submission['target'] = " "
for index, row in sample_submission.iterrows():
    result = is_fake(row['text'])
    target = 0 if result else 1
    row['target'] = target
sample_submission.head()
sample_submission.to_csv('sample_submission', index=False)