## Connect to drive

In [6]:
# Mount Google Drive into the Colab filesystem at /content/gdrive
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [7]:
!pwd
# Change directory
%cd ../content/gdrive/MyDrive/Colab Notebooks/Kaggle/NLP0
!pwd

/content
/content/gdrive/MyDrive/Colab Notebooks/Kaggle/NLP0
/content/gdrive/MyDrive/Colab Notebooks/Kaggle/NLP0


## Libraries

In [116]:
import pickle
import pandas as pd
import statsmodels.api as sm
import numpy as np
import ast

import nltk
# Fetch the NLTK resources used in this notebook: the stopword lists read via
# stopwords.words(...) and the 'punkt' data (presumably what word_tokenize relies on).
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk import word_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Import util functions

In [32]:
!pip install import-ipynb
import import_ipynb

#!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting import-ipynb
  Downloading import_ipynb-0.1.4-py3-none-any.whl (4.1 kB)
Collecting jedi>=0.10
  Downloading jedi-0.18.1-py2.py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 5.1 MB/s 
Installing collected packages: jedi, import-ipynb
Successfully installed import-ipynb-0.1.4 jedi-0.18.1


In [33]:
%cd ./utils
# Authenticate and create the PyDrive client.
# This only needs to be done once per notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
module = drive.CreateFile({'id':'1pegou5Ag0hDIKk6Y1wS9uwegE6oBTfZJ'})
module.GetContentFile('DataPrep.ipynb')
import DataPrep 

/content/gdrive/MyDrive/Colab Notebooks/Kaggle/NLP0/utils
importing Jupyter notebook from DataPrep.ipynb


In [34]:
%cd /content/gdrive/MyDrive/Colab Notebooks/Kaggle/NLP0
import sys
sys.path.append('/content/gdrive/MyDrive/Colab Notebooks/Kaggle/NLP0/utils')
execfile('utils/test.py')
hello()

/content/gdrive/MyDrive/Colab Notebooks/Kaggle/NLP0
Hello world


In [35]:
# Create data folder
#%mkdir data

## Load data & model

In [50]:
# Scoring data and the per-word pos/neg frequency table
test = pd.read_csv('data/test.csv')
posneg = pd.read_csv("data/posneg.csv")

# read in logreg model
# NOTE: unpickling can execute arbitrary code; only load model files you trust.
# The context manager closes the file handle once the model is loaded.
with open('logreg_model.sav', 'rb') as model_file:
  logreg = pickle.load(model_file)

# read in NB parameters: one entry per line of NB_list.txt.
# rstrip('\n') removes only the line terminator, so a final line without a
# trailing newline keeps its last character.
with open(r'NB_list.txt','r') as fp:
  nb = [line.rstrip('\n') for line in fp]

In [123]:
# First entry of the NB parameter list: the log prior, stored as text
prior_text = nb[0]
logprior = float(prior_text)
# Second entry: a Python literal holding the per-word log likelihoods
likelihood_text = nb[1]
loglikelihood = ast.literal_eval(likelihood_text)

## Score

### Preprocess data

In [39]:
# Independent copy holding only the tweet text, so new columns never touch `test`
sent = test.loc[:, ['text']].copy()

In [44]:
# NLTK English stopwords extended with a few extra tokens
my_stopwords = stopwords.words("english") + ['I','A','the','The','a']
# Bound .stem method of the stemmer, passed around as the word-rooting callable
word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem
# Punctuation characters handed to the cleaning function
my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@#'

In [48]:
# Apply the data preparation function from utils.
# These positional arguments are forwarded to DataPrep.clean_tweet after the tweet text.
clean_args = (my_punctuation, my_stopwords, word_rooter)

sent['clean_tweet'] = sent['text'].apply(DataPrep.clean_tweet, args=clean_args)
sent['clean_tweet_root'] = sent['text'].apply(DataPrep.clean_tweet, args=clean_args, root=True)

# Swap NaN results for empty strings in both cleaned columns
for column in ('clean_tweet', 'clean_tweet_root'):
  sent[column] = sent[column].replace(np.nan, '')

sent.head(5)

Unnamed: 0,text,clean_tweet,clean_tweet_root
0,Just happened a terrible car crash,happened terrible car crash,happen terribl car crash
1,"Heard about #earthquake is different cities, s...",heard earthquake different cities stay safe ev...,heard earthquak differ citi stay safe everyon
2,"there is a forest fire at spot pond, geese are...",forest fire spot pond geese fleeing across str...,forest fire spot pond gees flee across street ...
3,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting spokane wildfires,apocalyps light spokan wildfir
4,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kills china taiwa,typhoon soudelor kill china taiwa


In [79]:
# Word -> frequency lookups, built once so each token costs a dict lookup
# instead of a scan over the whole posneg table.
# NOTE: if a word appears in more than one posneg row, the last row is used.
neg_by_word = dict(zip(posneg['word'], posneg['neg']))
pos_by_word = dict(zip(posneg['word'], posneg['pos']))

# Sum pos/neg frequencies by sentence
rows = []
for s in sent['clean_tweet_root']:
  # an empty cleaned tweet has no tokens, so both sums stay at 0
  tokens = word_tokenize(s) if len(s) > 0 else []
  known = [w for w in tokens if w in neg_by_word]
  rows.append({
    'bias': 1,
    'neg': sum(int(neg_by_word[w]) for w in known),
    'pos': sum(int(pos_by_word[w]) for w in known),
  })

# Build the frame in one step rather than growing it row by row.
# Column order (bias, neg, pos) is kept because x is later taken from posneg_sent.values.
posneg_sent = pd.DataFrame(rows, columns=['bias','neg','pos'])
print(posneg_sent.head())

   bias  neg  pos
0     1  121  222
1     1  136  144
2     1  191  422
3     1   62  103
4     1   43  262


### LogReg

In [93]:
# Feature matrix: every column of posneg_sent, cast to int
x = np.array(posneg_sent.values, dtype=int)

# Prediction frame: ids copied from the test set, target filled in by the next cell
lr_pred = pd.DataFrame(columns=['id', 'target'])
lr_pred['id'] = test['id']

In [None]:
# Decision threshold: model outputs strictly above it are labelled 1, all others 0
th = 0.5
pred = logreg.predict(sm.add_constant(x))
lr_pred['target'] = [int(score > th) for score in pred]

lr_pred.head()

In [124]:
# Save the logistic-regression predictions (id, target) to CSV without the index column
lr_pred.to_csv('data/lr_pred.csv',index=False)

### NaiveBayes

In [105]:
# Predict target outcome
def naive_bayes_predict(tweet, logprior, loglikelihood):
    """Score a tweet with the Naive Bayes parameters.

    Args:
        tweet: text to score; it is split into tokens with word_tokenize.
        logprior: value the score starts from.
        loglikelihood: mapping from token to the value added for each
            occurrence of that token.

    Returns:
        logprior plus loglikelihood[token] for every token of the tweet that
        is present in loglikelihood. Tokens missing from the mapping add
        nothing.
    """
    # keep only the tokens that have a likelihood entry, in tweet order
    known_tokens = [token for token in word_tokenize(tweet) if token in loglikelihood]

    # the score starts at zero with the prior added on top
    score = 0 + logprior

    # accumulate one token at a time, left to right
    for token in known_tokens:
        score += loglikelihood[token]

    return score

In [121]:
# Prediction frame: ids copied from the test set
nb_pred = pd.DataFrame(columns=['id', 'target'])
nb_pred['id'] = test['id']

# Score every stemmed tweet; a strictly positive score is labelled 1, anything else 0
nb_scores = (
  naive_bayes_predict(tweet, logprior, loglikelihood)
  for tweet in sent['clean_tweet_root']
)
nb_pred['target'] = [int(score > 0) for score in nb_scores]

In [122]:
# Save the Naive Bayes predictions (id, target) to CSV without the index column
nb_pred.to_csv('data/nb_pred.csv',index=False)