In [4]:
import nltk, random
nltk.download('movie_reviews')
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/marci/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [6]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [7]:
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [9]:
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(5)

0.76
Most Informative Features
 contains(unimaginative) = True              neg : pos    =      8.5 : 1.0
    contains(schumacher) = True              neg : pos    =      7.1 : 1.0
     contains(atrocious) = True              neg : pos    =      6.7 : 1.0
          contains(mena) = True              neg : pos    =      6.4 : 1.0
        contains(suvari) = True              neg : pos    =      6.4 : 1.0


In [14]:
import itertools
import os
import re, string

import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix


train = pd.read_csv('../input/train.csv')
print('Training data shape: ', train.shape)
test = pd.read_csv('../input/test.csv')
print('Testing data shape: ', test.shape)

# Replacing the ambigious locations name with Standard names
train['location'].replace({'United States':'USA',
                           'New York':'USA',
                            "London":'UK',
                            "Los Angeles, CA":'USA',
                            "Washington, D.C.":'USA',
                            "California":'USA',
                             "Chicago, IL":'USA',
                             "Chicago":'USA',
                            "New York, NY":'USA',
                            "California, USA":'USA',
                            "FLorida":'USA',
                            "Nigeria":'Africa',
                            "Kenya":'Africa',
                            "Everywhere":'Worldwide',
                            "San Francisco":'USA',
                            "Florida":'USA',
                            "United Kingdom":'UK',
                            "Los Angeles":'USA',
                            "Toronto":'Canada',
                            "San Francisco, CA":'USA',
                            "NYC":'USA',
                            "Seattle":'USA',
                            "Earth":'Worldwide',
                            "Ireland":'UK',
                            "London, England":'UK',
                            "New York City":'USA',
                            "Texas":'USA',
                            "London, UK":'UK',
                            "Atlanta, GA":'USA',
                            "Mumbai":"India"},inplace=True)

# Applying a first round of text cleaning techniques

def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

def text_preprocessing(text):
    """
    Cleaning and parsing the text.

    """
    tokenizer = nltk.tokenize.TreebankWordTokenizer()
    
    nopunc = clean_text(text)
    tokenized_text = tokenizer.tokenize(nopunc)
    remove_stopwords = [w for w in tokenized_text if w not in stopwords.words('english')]
    return remove_stopwords

def combine_text(list_of_text):
    '''Takes a list of text and combines them into one large chunk of text.'''
    combined_text = ' '.join(list_of_text)
    return combined_text

all_word_list = []
train['text'].apply(lambda x: all_word_list.extend(text_preprocessing(x)))
train['text'] = train['text'].apply(lambda x: text_preprocessing(x))
test['text'] = test['text'].apply(lambda x: text_preprocessing(x))
print(train['text'][2])

print(all_word_list[:10])


Training data shape:  (7613, 5)
Testing data shape:  (3263, 4)


[nltk_data] Downloading package stopwords to /home/marci/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['residents', 'asked', 'shelter', 'place', 'notified', 'officers', 'evacuation', 'shelter', 'place', 'orders', 'expected']
['deeds', 'reason', 'earthquake', 'may', 'allah', 'forgive', 'us', 'forest', 'fire', 'near']


In [45]:
all_words = nltk.FreqDist(w.lower() for w in all_word_list)
word_features = list(all_words)[:2000]

print(word_features[:20])

def tweet_features(tweet):
    tweet_words = set(tweet)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in tweet_words)
    return features

['deeds', 'reason', 'earthquake', 'may', 'allah', 'forgive', 'us', 'forest', 'fire', 'near', 'la', 'ronge', 'sask', 'canada', 'residents', 'asked', 'shelter', 'place', 'notified', 'officers']


In [46]:
#print(train[["text", "target"]])
featuresets_pd =  train[["text", "target"]].apply(lambda textandtarget: (tweet_features(textandtarget[0]), textandtarget[1]), axis=1)
featuresets = featuresets_pd.values.tolist()
#print(featuresets[0])
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [48]:
print(nltk.classify.accuracy(classifier, test_set))
result = [classifier.classify(tweet_features(tweet)) for tweet in test['text']]

0.81


In [49]:
def submission(submission_file_path,solution):
    sample_submission = pd.read_csv(submission_file_path)
    sample_submission["target"] = solution
    sample_submission.to_csv("submission.csv", index=False)
    
submission_file_path = "../input/sample_submission.csv"

submission(submission_file_path, result)