In [1]:
import re
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import math
import plotly 
import plotly.plotly as py
import plotly.graph_objs as go
from sklearn.feature_extraction.text import CountVectorizer

# my functions
import helpers.data_mining_helpers as dmh
import helpers.text_analysis as ta

# 1. Data Preparation

In [2]:
# categories
categories = ['source','sentence', 'score' ,'segment','POS']
f0 = open('amazon_cells_labelled.txt','r')
f1 = open('imdb_labelled.txt','r',encoding = 'utf8')
f2 = open('yelp_labelled.txt','r')

#check data
lines0 = f0.readlines()
print("amazon :\n",[i.split('\t') for i in lines0[:10]])

lines1 = f1.readlines()
print("\nimdb :\n",[i.split('\t') for i in lines1[:10]])

lines2 = f2.readlines()
print("\nyelp :\n",[i.split('\t') for i in lines2[:10]])


amazon :
 [['So there is no way for me to plug it in here in the US unless I go by a converter.', '0\n'], ['Good case, Excellent value.', '1\n'], ['Great for the jawbone.', '1\n'], ['Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!', '0\n'], ['The mic is great.', '1\n'], ['I have to jiggle the plug to get it to line up right to get decent volume.', '0\n'], ['If you have several dozen or several hundred contacts, then imagine the fun of sending each of them one by one.', '0\n'], ['If you are Razr owner...you must have this!', '1\n'], ['Needless to say, I wasted my money.', '0\n'], ['What a waste of money and time!.', '0\n']]

imdb :
 [['A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  ', '0\n'], ['Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.  ', '0\n'], ['Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridicu

In [3]:
#check all data
print("amazon dataset: ",len(lines0))
print("imdb dataset: ",len(lines1))
print("yelp dataset: ",len(lines2))
print("\nall dataset: ",len(lines0)+len(lines1)+len(lines2))

amazon dataset:  1000
imdb dataset:  1000
yelp dataset:  1000

all dataset:  3000


Converting Dictionary into Pandas dataframe

In [4]:
categories = ['source','sentence', 'score']
sen = [] 
sco = []
sou = []

In [5]:
for line in lines0:
    sen.append(line[:len(line)-3])
    sco.append(line[len(line)-2:len(line)-1])
    sou.append("amazon")
    
for line in lines1:
    sen.append(line[:len(line)-3])
    sco.append(line[len(line)-2:len(line)-1])
    sou.append("imdb")
    
for line in lines2:
    sen.append(line[:len(line)-3])
    sco.append(line[len(line)-2:len(line)-1])
    sou.append("yelp")

In [6]:
#while True:
#    try:
#        Stop_words
#        stop_words = set(stopwords.words('english'))
#        stop_words.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', '...', "'s",'/','-','&','``','--',"''",""])
#        for sentence in str:
#             word_segment.append(list(set(word_tokenize(sentence.lower()))-set(stop_words)))
#             pos_token.append(nltk.pos_tag(set(word_tokenize(sentence.lower()))-set(stop_words),lang='eng'))
#        for pos_ in pos_token:
#            pos.append(list([pos for word,pos in pos_]))
#        break;
#    except: 
#        nltk.download()
#        continue

# 2. Converting Dictionary into Pandas dataframe

In [7]:
data_dict = {
             "source": sou,
             "sentence": sen,
             "score": sco,
            }
all_data = pd.DataFrame(data_dict,columns=['source','sentence','score'])

In [8]:
len(all_data)

3000

### 2.1 Familiarizing Data

In [9]:
#first 5 data
all_data[:5]

Unnamed: 0,source,sentence,score
0,amazon,So there is no way for me to plug it in here i...,0
1,amazon,"Good case, Excellent value.",1
2,amazon,Great for the jawbone.,1
3,amazon,Tied to charger for conversations lasting more...,0
4,amazon,The mic is great.,1


In [10]:
#last 5 data
all_data[-5:]

Unnamed: 0,source,sentence,score
2995,yelp,I think food should have flavor and texture an...,0
2996,yelp,Appetite instantly gone.,0
2997,yelp,Overall I was not impressed and would not go b...,0
2998,yelp,"The whole experience was underwhelming, and I ...",0
2999,yelp,"Then, as if I hadn't wasted enough of my life ...",0


In [11]:
#search imdb data
all_data[all_data['source'] =="imdb"][:5]

Unnamed: 0,source,sentence,score
1000,imdb,"A very, very, very slow-moving, aimless movie ...",0
1001,imdb,Not sure who was more lost - the flat characte...,0
1002,imdb,Attempting artiness with black & white and cle...,0
1003,imdb,Very little music or anything to speak of.,0
1004,imdb,The best scene in the movie was when Gerardo i...,1


In [12]:
all_data[0:10]

Unnamed: 0,source,sentence,score
0,amazon,So there is no way for me to plug it in here i...,0
1,amazon,"Good case, Excellent value.",1
2,amazon,Great for the jawbone.,1
3,amazon,Tied to charger for conversations lasting more...,0
4,amazon,The mic is great.,1
5,amazon,I have to jiggle the plug to get it to line up...,0
6,amazon,If you have several dozen or several hundred c...,0
7,amazon,If you are Razr owner...you must have this!,1
8,amazon,"Needless to say, I wasted my money.",0
9,amazon,What a waste of money and time!.,0


In [13]:
#search yelp data
all_data[all_data['source'] =="yelp"][:5]

Unnamed: 0,source,sentence,score
2000,yelp,Wow... Loved this place.,1
2001,yelp,Crust is not good.,0
2002,yelp,Not tasty and the texture was just nasty.,0
2003,yelp,Stopped by during the late May bank holiday of...,1
2004,yelp,The selection on the menu was great and so wer...,1


In [14]:
#search positive sentence
all_data[(all_data['score'] == "1")][:5]

Unnamed: 0,source,sentence,score
1,amazon,"Good case, Excellent value.",1
2,amazon,Great for the jawbone.,1
4,amazon,The mic is great.,1
7,amazon,If you are Razr owner...you must have this!,1
10,amazon,And the sound quality is great.,1


# 3. Check missing value

In [15]:
all_data.isnull().apply(lambda x: dmh.check_missing_values(x))

source      (The amoung of missing records is: , 0)
sentence    (The amoung of missing records is: , 0)
score       (The amoung of missing records is: , 0)
dtype: object

### 3.1 Insert missing values

In [16]:
fack_series= pd.Series(pd.np.nan, index=categories)
fack_data = all_data.append(fack_series, ignore_index=True)

In [17]:
len(fack_data)

3001

In [18]:
fack_data.isnull().apply(lambda x: dmh.check_missing_values(x))

source      (The amoung of missing records is: , 1)
sentence    (The amoung of missing records is: , 1)
score       (The amoung of missing records is: , 1)
dtype: object

In [19]:
fack_data.dropna(inplace=True)
fack_data.isnull().apply(lambda x: dmh.check_missing_values(x))

source      (The amoung of missing records is: , 0)
sentence    (The amoung of missing records is: , 0)
score       (The amoung of missing records is: , 0)
dtype: object

### 3.2 Dealing with Duplicate Data

In [20]:
sum(all_data["sentence"].duplicated())

17

In [21]:
all_data = all_data.drop_duplicates(['sentence'])

In [22]:
len(all_data)

2983

# 4. Data Processing

### 4.1 Sampling

In [23]:
data_sample = all_data.sample(n=1000)
len(data_sample)

1000

In [24]:
data_sample[0:4]

Unnamed: 0,source,sentence,score
563,amazon,If you are looking for a good quality Motorola...,0
1642,imdb,There's barely a boring moment in the film and...,1
1260,imdb,I came out of it feeling angry.,0
1309,imdb,But when someone strives for greatness and poe...,0


In [25]:
all_category_counts = ta.get_tokens_and_frequency(list(all_data.score))
Sample_category_counts = ta.get_tokens_and_frequency(list(data_sample.score))

In [26]:
py.iplot(ta.plot_word_frequency(all_category_counts, "Score distribution"))

In [27]:
py.iplot(ta.plot_word_frequency(Sample_category_counts, "Score distribution"))


### 4.2 Feature Creation

In [28]:
stop_words = set(stopwords.words('english'))
stop_words.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', '...', "'s",'/','-','&','``','--',"''",""])

all_data['unigrams'] = all_data['sentence'].apply(lambda x: list(set(word_tokenize(x))-stop_words))

In [29]:
all_data[0:4]['unigrams']

0        [converter, unless, way, I, plug, go, So, US]
1                       [Good, value, case, Excellent]
2                                     [Great, jawbone]
3    [PROBLEMS, Tied, conversations, lasting, 45, c...
Name: unigrams, dtype: object

In [30]:
list(all_data[0:3]['unigrams'])

[['converter', 'unless', 'way', 'I', 'plug', 'go', 'So', 'US'],
 ['Good', 'value', 'case', 'Excellent'],
 ['Great', 'jawbone']]

### 4.3 Feature subset selection

In [31]:
count_vect = CountVectorizer(stop_words='english')
X_counts = count_vect.fit_transform(all_data.sentence)

In [32]:
analyze = count_vect.build_analyzer()
analyze(" ".join(list(all_data[0:1].sentence)))

['way', 'plug', 'unless', 'converter']

In [33]:
" ".join(list(all_data[0:1].sentence))

'So there is no way for me to plug it in here in the US unless I go by a converter.'

In [34]:
X_counts.shape

(2983, 4896)

In [35]:
count_vect.get_feature_names()[0:20]

['00',
 '10',
 '100',
 '11',
 '12',
 '13',
 '15',
 '15g',
 '15pm',
 '17',
 '18',
 '18th',
 '1928',
 '1947',
 '1948',
 '1949',
 '1971',
 '1973',
 '1979',
 '1980']

In [36]:
all_data[0:5]

Unnamed: 0,source,sentence,score,unigrams
0,amazon,So there is no way for me to plug it in here i...,0,"[converter, unless, way, I, plug, go, So, US]"
1,amazon,"Good case, Excellent value.",1,"[Good, value, case, Excellent]"
2,amazon,Great for the jawbone.,1,"[Great, jawbone]"
3,amazon,Tied to charger for conversations lasting more...,0,"[PROBLEMS, Tied, conversations, lasting, 45, c..."
4,amazon,The mic is great.,1,"[The, great, mic]"


In [37]:
X_counts[0:5,0:20].toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [38]:
count_vect.transform(['Something completely new.']).toarray()

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [39]:
count_vect.transform(['00 Something completely new.']).toarray()

array([[1, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [40]:
plot_x = ["term_"+i for i in count_vect.get_feature_names()[0:50]]

In [41]:
print(plot_x)

['term_00', 'term_10', 'term_100', 'term_11', 'term_12', 'term_13', 'term_15', 'term_15g', 'term_15pm', 'term_17', 'term_18', 'term_18th', 'term_1928', 'term_1947', 'term_1948', 'term_1949', 'term_1971', 'term_1973', 'term_1979', 'term_1980', 'term_1986', 'term_1995', 'term_1998', 'term_20', 'term_2000', 'term_2005', 'term_2006', 'term_2007', 'term_20th', 'term_2160', 'term_23', 'term_24', 'term_25', 'term_2mp', 'term_30', 'term_30s', 'term_325', 'term_35', 'term_350', 'term_375', 'term_3o', 'term_40', 'term_40min', 'term_42', 'term_44', 'term_45', 'term_4s', 'term_4ths', 'term_50', 'term_5020']


In [42]:
plot_y = ["doc_"+ str(i) for i in (all_data.index)[0:50]]

In [43]:
plot_z = X_counts[0:50, 0:50].toarray()

In [44]:
py.iplot(ta.plot_heat_map(plot_x, plot_y, plot_z))

### 4.4 Dimensionality Reduction

In [45]:
from sklearn.decomposition import PCA

In [46]:
X_reduced = PCA(n_components=3).fit_transform(X_counts.toarray())

In [47]:
X_reduced.shape

(2983, 3)

In [48]:
trace1 = ta.get_trace(X_reduced, all_data["score"], "0", "rgb(71,233,163)")
trace2 = ta.get_trace(X_reduced, all_data["score"], "1", "rgb(229,65,136)")


in the future, boolean array-likes will be handled as a boolean array index


in the future, boolean array-likes will be handled as a boolean array index


in the future, boolean array-likes will be handled as a boolean array index



In [49]:
data = [trace1, trace2]

In [50]:
layout = go.Layout(
    margin=dict(
        l=0,
        r=0
    )
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='simple-3d-scatter')


### 4.5 Atrribute Transformation / Aggregation

In [51]:
term_frequencies = []
for j in range(0,X_counts.shape[1]):
    term_frequencies.append(sum(X_counts[:,j].toarray()))

In [52]:
term_frequencies[0]

array([1], dtype=int64)

In [53]:
py.iplot(ta.plot_word_frequency([count_vect.get_feature_names(), term_frequencies], "Term Frequency Distribution"))

In [54]:
term_frequencies_log = [math.log(i) for i in term_frequencies]

In [55]:
py.iplot(ta.plot_word_frequency([count_vect.get_feature_names(), term_frequencies_log], "Term Frequency Distribution"))

### 4.6 Discretization and Binarization

In [56]:
from sklearn import preprocessing, metrics, decomposition, pipeline, dummy
mlb = preprocessing.LabelBinarizer()
mlb.fit(all_data.score)


LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)

In [57]:
mlb.classes_
all_data['bin_category'] = mlb.transform(all_data['score']).tolist()

In [58]:
all_data[0:9]

Unnamed: 0,source,sentence,score,unigrams,bin_category
0,amazon,So there is no way for me to plug it in here i...,0,"[converter, unless, way, I, plug, go, So, US]",[0]
1,amazon,"Good case, Excellent value.",1,"[Good, value, case, Excellent]",[1]
2,amazon,Great for the jawbone.,1,"[Great, jawbone]",[1]
3,amazon,Tied to charger for conversations lasting more...,0,"[PROBLEMS, Tied, conversations, lasting, 45, c...",[0]
4,amazon,The mic is great.,1,"[The, great, mic]",[1]
5,amazon,I have to jiggle the plug to get it to line up...,0,"[get, decent, I, plug, right, line, volume, ji...",[0]
6,amazon,If you have several dozen or several hundred c...,0,"[sending, hundred, If, contacts, several, fun,...",[0]
7,amazon,If you are Razr owner...you must have this!,1,"[must, owner, Razr, If]",[1]
8,amazon,"Needless to say, I wasted my money.",0,"[I, wasted, say, Needless, money]",[0]
