# Import libraries

In [1]:
import pandas as pd
import numpy as np
import random
from nltk.tokenize import TweetTokenizer, RegexpTokenizer
from nltk.tag import pos_tag
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import re, string
from nltk.corpus import stopwords
from nltk import NaiveBayesClassifier
from nltk import classify
from sklearn.model_selection import train_test_split

In [2]:
# Whether the trained classifier is pickled to disk later in the notebook.
save_classifier = True
# Base name of the pickle file; the '.pickle' extension is appended at save time.
filename = 'Naive_Bayes_NLTK_v1'

# Extracting data

In [5]:
# Load the labelled training tweets.
# NOTE(review): absolute, machine-specific path — the notebook will not run on another machine as-is.
data=pd.read_csv(r'C:\Users\dell\Pictures\Tweets\data\train.csv')

In [6]:
# Count missing values per column.
data.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [7]:
# Preview the first rows of the training data.
data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


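Fill the missing `keyword` and `location` values with empty strings, then combine them with the tweet text into a single `all_words` column.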

In [8]:
# Replace missing keyword/location entries with empty strings so that the
# string concatenation below never produces NaN for those columns.
keyword_filled = data['keyword'].fillna('')
location_filled = data['location'].fillna('')
# One text field per tweet: "<text> <keyword> <location>".
data['all_words'] = data['text'] + ' ' + keyword_filled + ' ' + location_filled

In [9]:
# Display the frame to confirm the new 'all_words' column.
data

Unnamed: 0,id,keyword,location,text,target,all_words
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this #earthquake M...
1,4,,,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask. Canada
2,5,,,All residents asked to 'shelter in place' are ...,1,All residents asked to 'shelter in place' are ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"13,000 people receive #wildfires evacuation or..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby #Alaska as ...
...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,Two giant cranes holding a bridge collapse int...
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1,@aria_ahrary @TheTawniest The out of control w...
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611,10872,,,Police investigating after an e-bike collided ...,1,Police investigating after an e-bike collided ...


In [10]:
def check(X):
    """Split the combined tweet text by class and print the class sizes.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with a 'target' column (values convertible to numbers) and an
        'all_words' column.

    Returns
    -------
    tuple
        ``(positive, negative)``: the 'all_words' entries whose target equals
        1 and 0 respectively. Rows with any other target value are in neither.
    """
    labels = pd.to_numeric(X['target'])
    texts = X['all_words']
    is_disaster = labels == 1
    is_not_disaster = labels == 0
    positive = texts[is_disaster]
    negative = texts[is_not_disaster]
    print("Disaster", len(positive))
    print("---------------------")
    print("No Disaster", len(negative))
    return positive, negative

In [11]:
# Split the training texts into disaster (1) and non-disaster (0) groups.
positive, negative = check(data)

Disaster 3271
---------------------
No Disaster 4342


In [12]:
# Inspect the texts labelled as disasters.
positive

0       Our Deeds are the Reason of this #earthquake M...
1                Forest fire near La Ronge Sask. Canada  
2       All residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       Just got sent this photo from Ruby #Alaska as ...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609    @aria_ahrary @TheTawniest The out of control w...
7610    M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611    Police investigating after an e-bike collided ...
7612    The Latest: More Homes Razed by Northern Calif...
Name: all_words, Length: 3271, dtype: object

In [13]:
def tokening(positive, negative):
    """Tokenize every text in both collections with NLTK's TweetTokenizer.

    Parameters
    ----------
    positive, negative : iterable of str
        Texts of the two classes.

    Returns
    -------
    tuple of list
        ``(positive_tokens, negative_tokens)``: one token list per input
        text, in the same order as the inputs.
    """
    tokenizer = TweetTokenizer()
    positive_tokens = [tokenizer.tokenize(text) for text in positive]
    negative_tokens = [tokenizer.tokenize(text) for text in negative]
    return positive_tokens, negative_tokens

In [14]:
# Tokenize both classes; each element is the token list of one tweet.
positive_tokens, negative_tokens = tokening(positive, negative)

# Lemmatize sentence

In [15]:
# Extra tokens to discard on top of NLTK's English stop-word list.
add_words = ["...", "'"]
stop_words = [*stopwords.words('english'), *add_words]

In [16]:
# Function to lemmatize and clean words
def cleaned_words(tokens, stop_words):
    """Lemmatize tokens and drop punctuation and stop words.

    Each token is POS-tagged with ``nltk.pos_tag``. Tags starting with 'NN'
    are lemmatized as nouns, tags starting with 'VB' as verbs, and every
    other tag as an adjective.

    Parameters
    ----------
    tokens : list of str
        Tokens of a single text.
    stop_words : iterable of str
        Words to discard; compared against the lowercased lemma.

    Returns
    -------
    list of str
        Lowercased lemmas that survived the filters, in input order.
    """
    # Both objects are loop-invariant: build them once per call instead of
    # once per token, and use a set for constant-time stop-word lookups.
    lemmatizer = WordNetLemmatizer()
    stop_set = frozenset(stop_words)

    cleaned_tokens = []
    for token, tag in pos_tag(tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:  # all the rest tagged with a
            pos = 'a'
        word = lemmatizer.lemmatize(token, pos)
        lowered = word.lower()

        # `word not in string.punctuation` is a substring test: it removes
        # single punctuation marks (and runs that occur contiguously in
        # string.punctuation) but keeps tokens such as ';)'.
        if len(word) > 0 and word not in string.punctuation and lowered not in stop_set:
            cleaned_tokens.append(lowered)
    return cleaned_tokens

In [17]:
# Clean every tokenized tweet of each class (lemmatize, drop punctuation and
# stop words); the result keeps one token list per tweet.
positive_cleaned_tokens_list = [
    cleaned_words(tweet_tokens, stop_words) for tweet_tokens in positive_tokens
]
negative_cleaned_tokens_list = [
    cleaned_words(tweet_tokens, stop_words) for tweet_tokens in negative_tokens
]

In [18]:
# Spot-check the first ten cleaned non-disaster tweets.
print(negative_cleaned_tokens_list[:10])

[["what's", 'man'], ['love', 'fruit'], ['summer', 'lovely'], ['car', 'fast'], ['goooooooaaaaaal'], ['ridiculous'], ['london', 'cool', ';)'], ['love', 'skiing'], ['wonderful', 'day'], ['looooool']]


In [19]:
def get_all_words(cleaned_list):
    """Flatten a list of token lists into one flat list of tokens.

    Parameters
    ----------
    cleaned_list : iterable of iterable of str
        One token sequence per text.

    Returns
    -------
    list of str
        Every token of every text, in the original order (duplicates kept).
    """
    return [token for tokens in cleaned_list for token in tokens]


# Flat word lists per class, used for the frequency counts below.
all_neg_words = get_all_words(negative_cleaned_tokens_list)
all_pos_words = get_all_words(positive_cleaned_tokens_list)


In [20]:
from nltk import FreqDist

In [21]:
# Token frequency distributions for each class.
freq_pos = FreqDist(all_pos_words)
freq_neg = FreqDist(all_neg_words)

In [22]:
# Ten most frequent tokens in disaster and non-disaster tweets respectively.
print(freq_pos.most_common(10))
print(freq_neg.most_common(10))


[('\x89', 383), ('fire', 291), ('suicide', 204), ('û_', 171), ('bomb', 164), ('california', 147), ('crash', 143), ('flood', 142), ('kill', 141), ('building', 137)]
[('\x89', 442), ('new', 301), ('get', 298), ('like', 295), ('body', 216), ("i'm", 207), ('go', 190), ('û_', 171), ('scream', 152), ('wreck', 141)]


In [23]:
def get_dict(cleaned_list):
    """Lazily build one bag-of-words feature dict per token list.

    Parameters
    ----------
    cleaned_list : iterable of iterable of str
        One token sequence per text.

    Yields
    ------
    dict
        ``{token: True}`` for every distinct token of the text. This is a
        generator, so the result can be consumed only once.
    """
    for list_tokens in cleaned_list:
        yield dict.fromkeys(list_tokens, True)

In [24]:
# Feature-dict generators for each class. get_dict yields lazily, so each of
# these can be iterated only once.
positive_tokens_for_model = get_dict(positive_cleaned_tokens_list)
negative_tokens_for_model = get_dict(negative_cleaned_tokens_list)

In [25]:
# Pair every feature dict with its class label: 1 = disaster, 0 = no disaster.
pos_dataset = [(features, 1) for features in positive_tokens_for_model]
neg_dataset = [(features, 0) for features in negative_tokens_for_model]

In [26]:
all_set = pos_dataset + neg_dataset
# Shuffle with a dedicated, seeded generator: the order fed into the split
# (and therefore the reported accuracy) is then identical on every run,
# and the global `random` state is left untouched.
random.Random(42).shuffle(all_set)

# Hold out 33% of the labelled examples for evaluation.
train_set, test_set = train_test_split(all_set, test_size=0.33, random_state=42)

In [27]:
# Train the Naive Bayes model on the training split and show its class labels.
classifier = NaiveBayesClassifier.train(train_set)
print(classifier.labels())

[0, 1]


In [28]:
# Accuracy on the held-out test split.
print("accuracy is", classify.accuracy(classifier, test_set))

accuracy is 0.766016713091922


In [29]:
# show_most_informative_features() prints the table itself and returns None,
# so it is called directly; wrapping it in print() only adds a stray "None".
classifier.show_most_informative_features(10)

Most Informative Features
               hiroshima = True                1 : 0      =     41.1 : 1.0
                  atomic = True                1 : 0      =     29.6 : 1.0
                   spill = True                1 : 0      =     27.8 : 1.0
                 typhoon = True                1 : 0      =     27.8 : 1.0
                 suicide = True                1 : 0      =     25.9 : 1.0
                 20spill = True                1 : 0      =     24.3 : 1.0
                 bombing = True                1 : 0      =     21.6 : 1.0
                outbreak = True                1 : 0      =     21.6 : 1.0
                 reunion = True                1 : 0      =     19.0 : 1.0
most informative: None


# Confusion Matrix

In [30]:
from nltk.metrics import ConfusionMatrix


In [31]:
# Separate the (features, label) pairs of the held-out set into two parallel lists.
test_tag = [features for features, _label in test_set]
test_label = [label for _features, label in test_set]

In [32]:
# Predict labels for the held-out features and compare them with the true
# labels (rows = reference labels, columns = model predictions).
model_label = classifier.classify_many(test_tag)
cm = ConfusionMatrix(test_label, model_label)
print(cm.pretty_format(sort_by_count = True, show_percents = True, truncate = 9))

  |      0      1 |
--+---------------+
0 | <42.5%> 14.7% |
1 |   8.7% <34.1%>|
--+---------------+
(row = reference; col = test)



In [33]:
if save_classifier:
    import pickle
    # Append the extension only once, so re-running this cell does not
    # produce 'name.pickle.pickle'.
    if not filename.endswith('.pickle'):
        filename = filename + '.pickle'
    # The context manager flushes and closes the file even if dump() fails.
    with open(filename, 'wb') as f:
        pickle.dump(classifier, f)

In [34]:
def import_classifier(classifier):
    """Load a pickled classifier from disk.

    Parameters
    ----------
    classifier : str or path-like
        Path of the pickle file to read.

    Returns
    -------
    object
        The unpickled object.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code while loading; only open
    files that come from a trusted source.
    """
    import pickle
    # The context manager closes the file handle as soon as loading finishes.
    with open(classifier, 'rb') as f:
        return pickle.load(f)

In [35]:
# Load the unlabelled tweets to be classified.
# NOTE(review): absolute, machine-specific path — the notebook will not run on another machine as-is.
test_new = pd.read_csv(r'C:\Users\dell\OneDrive\Desktop\Etibar M\test.csv')

In [36]:
# Preview the first rows of the unlabelled data.
test_new.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [37]:
def final(df, model=None):
    """Classify every row of an unlabelled tweet frame.

    The rows go through the same preprocessing as the training data: text,
    keyword and location are joined, tokenized with TweetTokenizer, cleaned
    with ``cleaned_words`` and turned into feature dicts with ``get_dict``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'id', 'text', 'keyword' and 'location' columns. An
        'all_words' column is added to (or overwritten on) this frame.
    model : classifier, optional
        Object exposing ``classify_many``. Defaults to the notebook-level
        ``classifier`` when omitted.

    Returns
    -------
    pandas.DataFrame
        Two columns: 'id' (copied from ``df``) and 'target' (predicted label).
    """
    if model is None:
        model = classifier
    tweet_tokenizer = TweetTokenizer()

    keyword_filler = df['keyword'].fillna("")
    location_filler = df['location'].fillna("")
    df['all_words'] = df['text'] + " " + keyword_filler + " " + location_filler

    # Tokenize and clean each text in a single pass.
    cleaned_token_test = [
        cleaned_words(tweet_tokenizer.tokenize(sent), stop_words)
        for sent in df['all_words']
    ]
    # get_dict is a generator; materialize it so the feature sets form a list.
    feature_sets = list(get_dict(cleaned_token_test))

    result = pd.DataFrame({'id': df['id']})
    result['target'] = model.classify_many(feature_sets)

    return result
    
    
    
    

In [38]:
# Predict a label for every unlabelled tweet.
result = final(test_new)

In [39]:
# Inspect the predictions.
result

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [40]:
# Write the id/target predictions to a CSV file without the index column.
result.to_csv('result_NLTK.csv', index = False)