In [1]:
# necessary for when working with external scripts:
# load IPython's autoreload extension and use mode 2, which reloads imported
# modules before each cell executes, so edits to helper scripts take effect
# without restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import CountVectorizer
import plotly.plotly as py
import plotly.graph_objs as go
import math
import plotly.tools as tls

%matplotlib inline

# my functions
import helpers.data_mining_helpers as dmh
import helpers.text_analysis as ta
import os
import plotly
# Plotly credentials are taken from the environment so that no secret is stored
# in the notebook. Export PLOTLY_API_KEY (and optionally PLOTLY_USERNAME) before
# starting the kernel; a missing key raises KeyError immediately instead of
# failing later at plot time.
# NOTE(review): the API key that used to be hardcoded on this line has been
# exposed and should be regenerated in the Plotly account settings.
plotly.tools.set_credentials_file(
    username=os.environ.get('PLOTLY_USERNAME', 'nirajd'),
    api_key=os.environ['PLOTLY_API_KEY'],
)


In [3]:
# Path of project files.
# The location can be overridden with the DATA_MINING_PROJECT_DIR environment
# variable so the notebook also runs on other machines; when the variable is
# not set, the original author's directory is used, as before.
import os
from os import path
notebook_path = os.environ.get('DATA_MINING_PROJECT_DIR',
                               '/home/niraj/Data Mining Lab/assign_1')

In [4]:
# Load the pipe-delimited csv with every column read as raw text first,
# then give Score an integer type and Source a plain string type.
ds = pd.read_csv(
    path.join(notebook_path, 'sentences', 'dataset.csv'),
    sep='|',
    header=0,
    dtype=object,
    engine='python',
)
ds = ds.assign(
    Score=ds['Score'].astype(str).astype(int),
    Source=ds['Source'].astype(str),
)


In [5]:
# checking the data types of the columns
ds.dtypes

Source      object
Sentence    object
Score        int64
dtype: object

In [6]:
# number of rows in the dataframe
ds.shape[0]

3000

In [7]:
# list the column headers of the dataframe
ds.columns.tolist()

['Source', 'Sentence', 'Score']

In [8]:
# show the first 15 rows
ds.head(15)

Unnamed: 0,Source,Sentence,Score
0,amazon_cells,So there is no way for me to plug it in here i...,0
1,amazon_cells,"Good case, Excellent value.",1
2,amazon_cells,Great for the jawbone.,1
3,amazon_cells,Tied to charger for conversations lasting more...,0
4,amazon_cells,The mic is great.,1
5,amazon_cells,I have to jiggle the plug to get it to line up...,0
6,amazon_cells,If you have several dozen or several hundred c...,0
7,amazon_cells,If you are Razr owner...you must have this!,1
8,amazon_cells,"Needless to say, I wasted my money.",0
9,amazon_cells,What a waste of money and time!.,0


In [9]:
# sampling any 15 rows (without replacement); the fixed random_state makes
# the same rows come back on every Restart & Run All
ds.sample(n=15, replace=False, random_state=42)

Unnamed: 0,Source,Sentence,Score
1811,imdb,This film offers many delights and surprises.,1
965,amazon_cells,I cannot make calls at certain places.,0
440,amazon_cells,Very Displeased.,0
106,amazon_cells,That's a huge design flaw (unless I'm not usin...,0
2375,yelp,Best tacos in town by far!!,1
1106,imdb,The only thing really worth watching was the s...,1
930,amazon_cells,Never got it!!!!!,0
2352,yelp,Everyone is treated equally special.,1
2503,yelp,The ambiance was incredible.,1
70,amazon_cells,Mic Doesn't work.,0


In [10]:
# boolean mask of the dataframe: True wherever a value is missing
pd.isnull(ds)

Unnamed: 0,Source,Sentence,Score
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
5,False,False,False
6,False,False,False
7,False,False,False
8,False,False,False
9,False,False,False


In [11]:
# run the missing-value helper on the null mask of every column
ds.isnull().apply(dmh.check_missing_values)

Source      (The amoung of missing records is: , 0)
Sentence    (The amoung of missing records is: , 0)
Score       (The amoung of missing records is: , 0)
dtype: object

In [12]:
# an example record that deliberately lacks a Score value, used to
# demonstrate missing-data handling
dummy_series = pd.Series(
    data=["imdb", "I loved it"],
    index=["Source", "Sentence"],
)

In [13]:
# display the record that was just defined
dummy_series

Source            imdb
Sentence    I loved it
dtype: object

In [14]:
# insert the record into a copy of the dataframe as one extra row; its absent
# Score value is filled in as NaN. DataFrame.append was removed in pandas 2.0,
# so the Series is turned into a one-row frame and concatenated instead.
# sort=False keeps the original column order.
result_with_series = pd.concat([ds, dummy_series.to_frame().T],
                               ignore_index=True, sort=False)

In [15]:
# the row count should have grown by one if the new entry was added
result_with_series.shape[0]

3001

In [16]:
# re-run the missing-value helper on the frame that holds the extra record
result_with_series.isnull().apply(dmh.check_missing_values)

Source      (The amoung of missing records is: , 0)
Sentence    (The amoung of missing records is: , 0)
Score       (The amoung of missing records is: , 1)
dtype: object

In [17]:
# a one-record list whose entry has a Source and a Score but no Sentence
dummy_dict = [dict(Source='imdb', Score=1)]

In [18]:
# add the incomplete record to ds as a new row. DataFrame.append was removed
# in pandas 2.0, so the record list is wrapped in a DataFrame and concatenated;
# sort=False keeps the original column order.
# NOTE: re-running this cell adds the record again.
ds = pd.concat([ds, pd.DataFrame(dummy_dict)], ignore_index=True, sort=False)

In [19]:
# row count after adding the incomplete record
len(ds.index)

3001

In [20]:
# the missing-value helper should now report one gap
ds.isnull().apply(dmh.check_missing_values)

Source      (The amoung of missing records is: , 0)
Sentence    (The amoung of missing records is: , 1)
Score       (The amoung of missing records is: , 0)
dtype: object

In [21]:
# element-wise null mask of the dataframe
ds.isnull()

Unnamed: 0,Source,Sentence,Score
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
5,False,False,False
6,False,False,False
7,False,False,False
8,False,False,False
9,False,False,False


In [22]:
# select only the rows in which at least one column is null
ds.loc[ds.isnull().any(axis=1)]

Unnamed: 0,Source,Sentence,Score
3000,imdb,,1


In [23]:
# drop every row that contains a missing value; rebinding the name instead of
# mutating in place keeps the step explicit and chainable
ds = ds.dropna()

In [24]:
# confirm with the missing-value helper that no gaps remain after the drop
ds.isnull().apply(dmh.check_missing_values)

Source      (The amoung of missing records is: , 0)
Sentence    (The amoung of missing records is: , 0)
Score       (The amoung of missing records is: , 0)
dtype: object

Done with missing data; now it is time to deal with duplicate data. Duplicates can be kept, as they might help us find a pattern.

In [25]:
# number of rows that repeat an earlier row
ds.duplicated().sum()

17

In [26]:
# show the rows whose Source, Sentence and Score repeat an earlier row
ds.loc[ds.duplicated(subset=['Source', 'Sentence', 'Score'])]

Unnamed: 0,Source,Sentence,Score
285,amazon_cells,Great phone!.,1
407,amazon_cells,Works great.,1
524,amazon_cells,Works great!.,1
543,amazon_cells,Don't buy this product.,0
744,amazon_cells,If you like a loud buzzing to override all you...,0
748,amazon_cells,Does not fit.,0
778,amazon_cells,This is a great deal.,1
792,amazon_cells,Great Phone.,1
892,amazon_cells,Excellent product for the price.,1
896,amazon_cells,Great phone.,1


In [27]:
# draw a random sample of 600 rows; the fixed random_state makes the sample,
# and every chart and table derived from it, identical on each re-run
ds_sample = ds.sample(n=600, random_state=42)

In [28]:
# number of rows in the sample
ds_sample.shape[0]

600

In [29]:
# first 15 rows of the sample, selected by position
ds_sample.iloc[:15]

Unnamed: 0,Source,Sentence,Score
474,amazon_cells,The delivery was on time.,1
1853,imdb,Everything stinks.,0
558,amazon_cells,Virgin Wireless rocks and so does this cheap l...,1
2742,yelp,I'm not sure how long we stood there but it wa...,0
1681,imdb,"The film gives meaning to the phrase, ""Never i...",1
1346,imdb,The kids are very cool too.,1
1046,imdb,The story itself is just predictable and lazy.,0
2305,yelp,This greedy corporation will NEVER see another...,0
1310,imdb,To those who find this movie intelligent or ev...,0
1348,imdb,It's just lame.,0


In [30]:
# name of the first column
list(ds.columns)[0]

'Source'

In [31]:
# pass the Source column of the full dataset, and of the sample, to the
# token-frequency helper
ds_category_counts = ta.get_tokens_and_frequency(ds['Source'].tolist())
ds_sample_category_counts = ta.get_tokens_and_frequency(ds_sample['Source'].tolist())

In [32]:
# build the figure from the full-dataset Source frequencies, then render it
category_figure = ta.plot_word_frequency(ds_category_counts, "Category distribution")
py.iplot(category_figure)

In [33]:
# build the figure from the sample's Source frequencies, then render it
sample_category_figure = ta.plot_word_frequency(ds_sample_category_counts,
                                                "Category distribution_Sample")
py.iplot(sample_category_figure)

In [36]:
# Count how many rows each source contributes, for the full dataset and for
# the random sample.
series = ds['Source'].value_counts()
series_sample = ds_sample['Source'].value_counts()

# value_counts() orders its result by frequency, not by label, so each count
# is looked up by source name. Positional access (series[0], series[1], ...)
# would file a count under whichever source happens to rank first, which can
# mislabel the table columns.
source_by_column = {'imdb': 'imdb', 'yelp': 'yelp', 'amazon': 'amazon_cells'}
compare_data = {'Data': ['Orignal', 'Sample']}
compare_data.update({
    # .get(..., 0) keeps the table valid if a source is absent from the sample
    column: [series.get(source, 0), series_sample.get(source, 0)]
    for column, source in source_by_column.items()
})
ds_sample_compare = pd.DataFrame(compare_data, columns=['Data', 'imdb', 'yelp', 'amazon'])
ds_sample_compare = ds_sample_compare.set_index('Data')
ds_sample_compare

Unnamed: 0_level_0,imdb,yelp,amazon
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Orignal,1000,1000,1000
Sample,205,203,192


NameError: name 'cf' is not defined