<a href="https://colab.research.google.com/github/AmanPriyanshu/Natural-Language-Processing/blob/master/SimpleLogisticRegressionForVectorClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Downloading the dataset

In [1]:
!mkdir -p data
!wget -nc https://nyc3.digitaloceanspaces.com/ml-files-distro/v1/sentiment-analysis-is-bad/data/sentiment140-subset.csv.zip -P data
!unzip -n -d data data/sentiment140-subset.csv.zip

File ‘data/sentiment140-subset.csv.zip’ already there; not retrieving.

Archive:  data/sentiment140-subset.csv.zip


## IMPORTS:

In [2]:
import nltk
import pandas as pd
import numpy as np
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
nltk.download('stopwords')
from tqdm import tqdm
import string
import tensorflow as tf

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Only importing 30,000 values since this is more of practice and demo for refrence

In [3]:
df = pd.read_csv("data/sentiment140-subset.csv", nrows=30000)
print(df.head())

   polarity                                               text
0         0                      @kconsidder You never tweet  
1         0                 Sick today  coding from the couch.
2         1  @ChargerJenn Thx for answering so quick,I was ...
3         1  Wii fit says I've lost 10 pounds since last ti...
4         0  @MrKinetik Not a thing!!!  I don't really have...


### Let's count positives and negatives

In [4]:
df.polarity.value_counts()

1    15064
0    14936
Name: polarity, dtype: int64

In [5]:
df = df.values
print(df)

[[0 '@kconsidder You never tweet  ']
 [0 'Sick today  coding from the couch.']
 [1
  '@ChargerJenn Thx for answering so quick,I was afraid I was gonna crash twitter with all the spamming I did 2 RR..sorry bout that ']
 ...
 [1
  '@phnompenhpost thanks for the follow! u guys do a great job in reporting news about Cambodia...makes me proud to be cambodian ']
 [0
  "@coliwilso crapï¿½ I really wanted to make it for @minmï¿½ but I'm feeling way too tired after the whole weekend "]
 [1
  'follow friday- @theclassiccrime @jeremycamp @chris_daughtry &amp; @dannygokey ']]


In [6]:
polarity = df.T[0].flatten()
tweets = df.T[1].flatten()

In [7]:
tweets

array(['@kconsidder You never tweet  ',
       'Sick today  coding from the couch.',
       '@ChargerJenn Thx for answering so quick,I was afraid I was gonna crash twitter with all the spamming I did 2 RR..sorry bout that ',
       ...,
       '@phnompenhpost thanks for the follow! u guys do a great job in reporting news about Cambodia...makes me proud to be cambodian ',
       "@coliwilso crapï¿½ I really wanted to make it for @minmï¿½ but I'm feeling way too tired after the whole weekend ",
       'follow friday- @theclassiccrime @jeremycamp @chris_daughtry &amp; @dannygokey '],
      dtype=object)

## PREPROCESSING:

In [8]:
def stopwords_punctuation(arr):
  new_arr = []
  diction = {}
  for p in string.punctuation:
    diction.update({p:' '})
  for s in tqdm(arr):
    s = s.translate(str.maketrans(diction))
    new_arr.append(' '.join([i for i in s.split() if i not in stopwords.words('english')]))
  new_arr = np.array(new_arr)
  return new_arr

In [9]:
def stemming_lowercase(arr):
  porter = PorterStemmer()
  stemmed_arr = []
  for s in tqdm(arr):
    s = s.lower()
    stemmed_arr.append(' '.join([porter.stem(word) for word in s.split()]))
  stemmed_arr = np.array(stemmed_arr)
  return stemmed_arr

In [10]:
tweets = stopwords_punctuation(tweets)
tweets = stemming_lowercase(tweets)

100%|██████████| 30000/30000 [00:48<00:00, 613.24it/s]
100%|██████████| 30000/30000 [00:04<00:00, 6498.55it/s]


## TRAINING FREQUENCIES:

In [11]:
def generate_positive_negative_word_counts(polar_arr, str_arr):
  word_feature = {}
  for p,s in tqdm(zip(polar_arr, str_arr), total=len(str_arr)):
    s = s.split()
    for w in s:
      if w not in list(word_feature.keys()):
        word_feature.update({w: [0, 0]})
      word_feature[w][p] += 1
  return word_feature

In [12]:
word_feature = generate_positive_negative_word_counts(polarity[:int(0.5*polarity.shape[0])], tweets[:int(0.5*polarity.shape[0])])

100%|██████████| 15000/15000 [00:20<00:00, 722.43it/s]


## TWEET to REPRESENTATION

In [13]:
def tweet2rep(arr, word_feature):
  rep = []
  for s in tqdm(arr):
    p = 0
    n = 0
    for w in s.split():
      try:
        p += word_feature[w][1]
        n += word_feature[w][0]
      except:
        pass
    rep.append([1, p, n])
  rep = np.array(rep)
  return rep

In [14]:
representation = tweet2rep(tweets, word_feature)
print(representation[:5])

100%|██████████| 30000/30000 [00:00<00:00, 190249.22it/s]


[[    1   403   283]
 [    1   336   508]
 [    1  8222 11827]
 [    1  3225  4526]
 [    1  2947  4256]]


## LOGISTIC REGRESSION:

In [15]:
model = tf.keras.Sequential([
                             tf.keras.layers.Dense(2, activation='relu'), 
                             tf.keras.layers.Dense(1, activation='sigmoid'), 
                             ])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc', 'AUC'])

In [16]:
representation = representation.astype('float')
polarity = polarity.astype('float')

In [17]:
model.fit(representation, polarity, epochs=50, validation_split=0.2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f5bef48f5c0>

## DONE