In [14]:
import tensorflow as tf 
import pandas as pd
import numpy as np 
from joblib import dump, load
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils import shuffle

In [2]:
data = pd.read_csv('./data.csv',  encoding='ISO-8859-1', names=['sentiment', 'ids', 'date','flag', 'user','text'])
#(0 = negative, 2 = neutral, 4 = positive)
data = data.dropna()
data = data.drop(columns=['ids','date','flag','user'])
# print(len(data))
data.sentiment = data.sentiment.replace(4,1)
data = shuffle(data, random_state=22)
data.head()
# data.to_csv('./dataFinal.csv', index= False)
# data.sentiment.value_counts()

Unnamed: 0,sentiment,text
1013875,1,#followfriday @foyboy 1) She had Skips on her ...
1064419,1,@PaulaAbdul we are finally starting to see the...
290763,0,w. all my old coworkers going to the wake RIP...
770144,0,"yeah, Bravo TV. I so wanna learn more about th..."
1266526,1,"@JCTrick Thanks, J! I think I'm almost getting..."


In [102]:

used_data_size = int(len(data) * .9)
# print(used_data_size)
used_data = data[:used_data_size]
# print(len(used_data))
type(used_data)

pandas.core.frame.DataFrame

In [103]:
tags_split = [str(tags) for tags in used_data['sentiment'].values]
# print(tags_split)

In [104]:
tag_encoder = MultiLabelBinarizer()
tags_encoded = tag_encoder.fit_transform(tags_split)
# print(tags_encoded)
num_tags = len(tags_encoded[0])
# print(num_tags)
print(data['text'].values[0])
print(tag_encoder.classes_)
print(tags_encoded[0])

#followfriday @foyboy 1) She had Skips on her face earlier this week, 2) Name-checks me on the telly in a grand, spectacular fashion 
['0' '1']
[0 1]


In [105]:
# Split our data into train and test sets
train_size = int(len(used_data) * .8)
print ("Train size: %d" % train_size)
print ("Test size: %d" % (len(used_data) - train_size))

Train size: 1152000
Test size: 288000


In [106]:
# Split our labels into train and test sets
train_tags = tags_encoded[:train_size]
test_tags = tags_encoded[train_size:]


In [107]:
%%writefile preprocess.py

# Pre-processing data: create our tokenizer class

from tensorflow.keras.preprocessing import text

class TextPreprocessor(object):
  def __init__(self, vocab_size):
    self._vocab_size = vocab_size
    self._tokenizer = None
  
  def create_tokenizer(self, text_list):
    tokenizer = text.Tokenizer(num_words=self._vocab_size)
    tokenizer.fit_on_texts(text_list)
    self._tokenizer = tokenizer

  def transform_text(self, text_list):
    text_matrix = self._tokenizer.texts_to_matrix(text_list)
    return text_matrix

Overwriting preprocess.py


In [108]:
# Create vocab from training corpus
from preprocess import TextPreprocessor

VOCAB_SIZE=25 # This is a hyperparameter, try out different values for your dataset

train_qs = used_data['text'].values[:train_size]
test_qs = used_data['text'].values[train_size:]

processor = TextPreprocessor(VOCAB_SIZE)
processor.create_tokenizer(train_qs)

body_train = processor.transform_text(train_qs)
body_test = processor.transform_text(test_qs)


In [109]:
# Preview the first input from our training data
print(len(body_train[0]))
print(body_train[0])

25
[0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0.]


In [110]:
# Save the processor state of the tokenizer
import pickle

with open('./processor_state.pkl', 'wb') as f:
  pickle.dump(processor, f)

In [111]:
def create_model(vocab_size, num_tags):
  
  model = tf.keras.models.Sequential()
  model.add(tf.keras.layers.Dense(50, input_shape=(VOCAB_SIZE,), activation='relu'))
  model.add(tf.keras.layers.Dense(25, activation='relu'))
#   model.add(tf.keras.layers.Dense(20, activation='relu'))
  model.add(tf.keras.layers.Dense(num_tags, activation='sigmoid'))

  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

In [112]:
model = create_model(VOCAB_SIZE, num_tags)
model.summary()

# Train and evaluate the model
model.fit(body_train, train_tags, epochs=3, batch_size=64, validation_split=0.1)
print('Eval loss/accuracy:{}'.format(
  model.evaluate(body_test, test_tags, batch_size=64)))

# Export the model to a file
model.save('keras_saved_model.h5')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 50)                1300      
_________________________________________________________________
dense_1 (Dense)              (None, 25)                1275      
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 52        
Total params: 2,627
Trainable params: 2,627
Non-trainable params: 0
_________________________________________________________________
Train on 1036800 samples, validate on 115200 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Eval loss/accuracy:[0.6527961039145788, 0.6158663194444445]


In [113]:
%%writefile model_prediction.py

# Use custom model prediction to save our model + tokenizer


import pickle
import os
import numpy as np

class CustomModelPrediction(object):

  def __init__(self, model, processor):
    self._model = model
    self._processor = processor
  
  def predict(self, instances, **kwargs):
    preprocessed_data = self._processor.transform_text(instances)
    predictions = self._model.predict(preprocessed_data)
    return predictions.tolist()

  @classmethod
  def from_path(cls, model_dir):
    import tensorflow.keras as keras
    model = keras.models.load_model(
      os.path.join(model_dir,'keras_saved_model.h5'))
    with open(os.path.join(model_dir, 'processor_state.pkl'), 'rb') as f:
      processor = pickle.load(f)

    return cls(model, processor)
 

Overwriting model_prediction.py


In [13]:
test_requests = ["i am not having a nice day"]

In [117]:
from model_prediction import CustomModelPrediction

classifier = CustomModelPrediction.from_path('.')
results = classifier.predict(test_requests)
print(results)

for i in range(len(results)):
  print('Predicted labels:')
  for idx,val in enumerate(results[i]):
    if val > 0.5:
      print(tag_encoder.classes_[idx])
  print('\n')

[[0.7188213467597961, 0.280379056930542]]
Predicted labels:
0




In [2]:
data1 = pd.read_csv('./data.csv',  encoding='ISO-8859-1', names=['sentiment', 'ids', 'date','flag', 'user','text'])
#(0 = negative, 2 = neutral, 4 = positive)
data1 = data1.dropna()
data1 = data1.drop(columns=['ids','date','flag','user'])
# print(len(data))
data1.sentiment = data1.sentiment.replace(4,1)
data1 = shuffle(data1, random_state=22)
data1.head()
# data.to_csv('./dataFinal.csv', index= False)
# data.sentiment.value_counts()

In [3]:
data1 = pd.read_csv('./clean_tweet.csv', index_col=0)
data1 = data1.dropna()
data1.target = data1.target.replace(4,1)
data1 = shuffle(data1, random_state=22)
data1.head()

  mask |= (ar1 == a)


Unnamed: 0,text,target
1200205,i m honored my friend have a great week,1
1097971,hahas gotta change airline,1
306692,grrrrrrrr my picture isn t showing up,0
94044,el pollo loco commercials gross me out i can j...,0
1211014,see wouldnt it have been easier if you were on...,1


In [4]:
import re
def preprocess(tweet):
    #Preprocess the text in a single tweet
    #arguments: tweet = a single tweet in form of string 
    #convert the tweet to lower case
    tweet.lower()
    #convert all urls to sting "URL"
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)
    #convert all @username to "AT_USER"
    tweet = re.sub('@[^\s]+','AT_USER', tweet)
    #correct all multiple white spaces to a single white space
    tweet = re.sub('[\s]+', ' ', tweet)
    #convert "#topic" to just "topic"
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    return tweet



data1['text'] = data1['text'].apply(preprocess)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data1['text'],data1['target'],test_size=0.2,shuffle=True)

tfidf = TfidfVectorizer(sublinear_tf=True, stop_words = "english", lowercase=True, max_features=15000)
coded = tfidf.fit(X_train)
coded1 = tfidf.transform(X_train)

In [6]:
coded1.shape

(1277397, 10000)

In [19]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier() 
knn.fit(coded1,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [6]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [None]:
sv = SVC()
sv.fit(coded1,y_train)

In [10]:
# %%timeit
lg = LogisticRegression()
lg.fit(coded1, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [7]:
# %%timeit
nbmodel = MultinomialNB(alpha=10, fit_prior=False)
nbmodel.fit(coded1, y_train)

MultinomialNB(alpha=10, class_prior=None, fit_prior=False)

In [11]:
# prediction = nbmodel.predict(tfidf.transform(X_test))
prediction = lg.predict(tfidf.transform(X_test))
# prediction = knn.predict(tfidf.transform(X_test))

In [12]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
print(accuracy_score(y_test,prediction))

0.7749898230781275


In [32]:
q = tfidf.transform(['i am not feeling ok today'])
prediction = model.predict(q)
print(classification_report(y_test,prediction))
print(confusion_matrix(y_test,prediction))

             precision    recall  f1-score   support

          0       0.75      0.79      0.77    159568
          1       0.78      0.74      0.76    159782

avg / total       0.77      0.77      0.77    319350

[[125888  33680]
 [ 41060 118722]]


In [10]:
from sklearn.model_selection import GridSearchCV
gcv=GridSearchCV(model,{'alpha':[1.5,2,3,4,10,100,1.0,0.1,0.001,0.0001],'fit_prior':[True,False]})
gcv.fit(coded,y_train)

TypeError: Singleton array array(TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None), dtype=object) cannot be considered a valid collection.

In [31]:
print(gcv.best_score_,gcv.best_params_)

0.76691328125 {'alpha': 10, 'fit_prior': False}


In [30]:

dump(nbmodel, 'MultinomialNB.joblib') 
# clf = load('MultinomialNB.joblib')


['MultinomialNB.joblib']

In [31]:
# import pickle
# pickle.dump(coded, open("tfidf1.pkl", "wb"))
# pickle.dump(model, open("mod.pkl", "wb"))
dump(coded, open("tfidf1.pkl", "wb"))

In [39]:
q = tfidf.transform(['i am not feeling ok today'])
prediction = clf.predict(q)


In [41]:
prediction[0]

0