# Phase 1: Data Pre-processing

### Importing libraries

In [None]:
# required imports
import gzip, json, os, pandas as pd, requests, re, unicodedata, nltk

from urllib.request import urlopen
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer
from sklearn import metrics
from collections import Counter
from nltk.corpus import wordnet

# Fetch the NLTK resources used by the text pre-processing cells below.
nltk.download('stopwords')   # English stop-word list
nltk.download('wordnet')     # WordNet data for the part-of-speech lookup and the lemmatizer
nltk.download('omw-1.4')     # Open Multilingual WordNet data
nltk.download('punkt_tab')   # tokenizer tables that newer NLTK releases load in word_tokenize()
nltk.download('punkt')       # Punkt models used by older NLTK releases

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Mount Google Drive at /content/drive so the CSV data files can be read from it.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime — confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Reading data files

* Note: The data files are shared among the team members and Prof. David Goldberg. Add a shortcut to the shared folder in your own Drive, then set the "location" variable below to the path of that folder.

In [None]:
# Folder that holds the four review CSV files. Keep exactly one `location`
# assignment active. The value must end with '/' because the file names below
# are appended to it with plain string concatenation.
location = '/content/drive/MyDrive/Colab Notebooks/MIS798_Files/'        # Parul
# location = '/content/drive/MyDrive/MIS798_SS/'                         # Uma
# location = '/content/drive/MyDrive/MIS798_Files/'                      # Nishu
# location =                                                             # Add your files location here

# File names of the per-product review datasets inside `location`.
blenders = 'Blenders.csv'
cooker = 'Slow_cookers.csv'
coffee = 'Coffee_makers.csv'
toaster = 'Toaster_ovens.csv'

In [None]:
# Load one dataframe per product category from the shared data folder.
df_blender = pd.read_csv(f'{location}{blenders}')
df_coffee = pd.read_csv(f'{location}{coffee}')
df_cooker = pd.read_csv(f'{location}{cooker}')
df_toaster = pd.read_csv(f'{location}{toaster}')

In [None]:
# Show the column names, non-null counts and dtypes of the blender reviews.
df_blender.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11970 entries, 0 to 11969
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   DataSet Entry ID      11970 non-null  int64 
 1   Text                  11970 non-null  object
 2   Tagger Pid            11970 non-null  object
 3   Date                  11970 non-null  object
 4   Defect                11965 non-null  object
 5   Components Mentioned  11970 non-null  object
 6   Comment               1137 non-null   object
 7   Authority?            11970 non-null  bool  
dtypes: bool(1), int64(1), object(6)
memory usage: 666.4+ KB


In [None]:
# Merge the four product datasets into one dataframe. ignore_index=True gives
# every row a unique 0..n-1 label; without it each file keeps its own index,
# so the same label would refer to up to four different reviews.
frames = [df_blender, df_coffee, df_cooker, df_toaster]
df = pd.concat(frames, ignore_index=True)

In [None]:
# Preview the combined dataframe.
df

Unnamed: 0,DataSet Entry ID,Text,Tagger Pid,Date,Defect,Components Mentioned,Comment,Authority?
0,88250594,Excellent Product and a Great Price! These bla...,bowenhc,8/29/17 8:56,No Defect,[Blade or Cutter Assembly],,False
1,54712811,Works as advertised Every morning so far I've ...,bowenhc,8/29/17 8:56,No Defect,[No component mentioned],,False
2,26356181,"A great machine, but...What's wrong? I wish th...",bowenhc,8/29/17 8:56,Performance Defect,"[Motors, Blade or Cutter Assembly]",,False
3,83791618,2 dead do I want another I like this product w...,bowenhc,8/29/17 8:56,Performance Defect,"[Motors, Base]",,False
4,54877071,Magic Bullet...not magic at all. I was very di...,bowenhc,8/29/17 8:56,Performance Defect,[No component mentioned],,False
...,...,...,...,...,...,...,...,...
13858,86160275,It's great in all other ways but it has no tim...,crosales,9/10/17 17:50,Safety Hazard,[Other],no timer,False
13859,53034745,I purchased this Toaster for my Wife's Birthda...,crosales,9/10/17 17:50,No Defect,[No component mentioned],,False
13860,29374583,This toaster oven is much bigger than it looks...,crosales,9/10/17 17:50,No Defect,[No component mentioned],,False
13861,40517377,I purchased this product 6 months ago. It nev...,crosales,9/10/17 17:50,Performance Defect,[Other],never made good toast,False


### Data Filtering
* Only reviews labelled "No Defect" or "Performance Defect" are used; reviews labelled "Safety Hazard" are removed.

In [None]:
# Filter out the rows tagged 'Safety Hazard'. Rows whose Defect value is
# missing also pass this comparison; they are removed later by dropna().
# .copy() makes the result an independent dataframe, so the column assignments
# in the following cells do not write into a slice of the combined frame
# (which is what triggers pandas' SettingWithCopyWarning).
df = df[df['Defect'] != 'Safety Hazard'].copy()
df

Unnamed: 0,DataSet Entry ID,Text,Tagger Pid,Date,Defect,Components Mentioned,Comment,Authority?
0,88250594,Excellent Product and a Great Price! These bla...,bowenhc,8/29/17 8:56,No Defect,[Blade or Cutter Assembly],,False
1,54712811,Works as advertised Every morning so far I've ...,bowenhc,8/29/17 8:56,No Defect,[No component mentioned],,False
2,26356181,"A great machine, but...What's wrong? I wish th...",bowenhc,8/29/17 8:56,Performance Defect,"[Motors, Blade or Cutter Assembly]",,False
3,83791618,2 dead do I want another I like this product w...,bowenhc,8/29/17 8:56,Performance Defect,"[Motors, Base]",,False
4,54877071,Magic Bullet...not magic at all. I was very di...,bowenhc,8/29/17 8:56,Performance Defect,[No component mentioned],,False
...,...,...,...,...,...,...,...,...
13857,59606188,"I love my new toaster oven, my old one did not...",crosales,9/10/17 17:49,No Defect,[No component mentioned],,False
13859,53034745,I purchased this Toaster for my Wife's Birthda...,crosales,9/10/17 17:50,No Defect,[No component mentioned],,False
13860,29374583,This toaster oven is much bigger than it looks...,crosales,9/10/17 17:50,No Defect,[No component mentioned],,False
13861,40517377,I purchased this product 6 months ago. It nev...,crosales,9/10/17 17:50,Performance Defect,[Other],never made good toast,False


In [None]:
# Class balance of the two labels kept for modelling: a boolean mask is built
# per label and its True values are summed.
print('No Defect row Count :', df['Defect'].eq('No Defect').sum())
print('Performance Defect row count :', df['Defect'].eq('Performance Defect').sum())

No Defect row Count : 35700
Performance Defect row count : 13995


### Text Pre-processing
* The code cell below defines the functions used to process the textual field 'Text' (the review text). The operations performed on the text data are:
 

1.   remove URLs, numbers, non-ASCII characters, punctuation, stopwords, and one- and two-letter words
2.   convert text to lowercase
3. lemmatize text
4. convert and filter words based on part-of-speech (pos) tag





In [None]:
# Shared resources for the text-cleaning helpers defined below.
lemmatizer = nltk.stem.WordNetLemmatizer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
# English stop words with duplicates removed, kept as a list.
stop_words = list(set(stopwords.words('english')))

def remove_url(dataframe):
    """Strip http/https URLs from the 'Text' column of `dataframe`.

    Each URL, together with the whitespace around it, is replaced by a single
    space; leading and trailing whitespace is then trimmed from every value.

    Args:
        dataframe: pandas DataFrame with a string column named 'Text'.

    Returns:
        The same DataFrame object that was passed in; its 'Text' column is
        overwritten.
    """
    # \s and \S need their backslashes: written as plain 's' and 'S' the
    # pattern looks for literal letters and never matches a real URL.
    url_pattern = r'\s*https?://\S+(\s+|$)'
    # regex=True is explicit because pandas >= 2.0 otherwise treats the
    # pattern as a literal string.
    dataframe['Text'] = (
        dataframe['Text']
        .str.replace(url_pattern, ' ', regex=True)
        .str.strip()
    )
    return dataframe


def remove_numbers(text):
    """Return `text` with every run of digits deleted."""
    return re.sub(r'\d+', '', text)


def remove_non_ascii(words):
    """Reduce every token in `words` to its ASCII characters.

    Each token is NFKD-normalised, then every character that cannot be
    encoded as ASCII is dropped. The returned list has one entry per input
    token; a token may come back as an empty string.
    """
    return [
        unicodedata.normalize('NFKD', token).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        for token in words
    ]


def to_lowercase(words):
    """Return a new list holding the lowercase form of every token in `words`."""
    return [token.lower() for token in words]


def remove_punctuation(words):
    """Remove punctuation from list of tokenized words.

    Every character that is neither a word character nor whitespace is treated
    as a separator. A token made only of punctuation therefore disappears, and
    a token with embedded punctuation is split into its parts
    ('well-made' -> 'well', 'made').

    Args:
        words: Iterable of string tokens.

    Returns:
        A new list of non-empty tokens that contain no punctuation.
    """
    new_words = []
    for word in words:
        # Replace punctuation with a space, then split on whitespace: split()
        # with no argument drops empty pieces, so blank tokens are never
        # emitted and each part becomes its own token for the later steps.
        new_words.extend(re.sub(r'[^\w\s]', ' ', word).split())
    return new_words


def remove_stopwords(words):
    """Return the tokens of `words` that are not in the module-level `stop_words`."""
    return [token for token in words if token not in stop_words]

def get_pos(word):
    """Return the WordNet part-of-speech tag with the most synsets for `word`.

    Only the tags 'n', 'v', 'a' and 'r' are counted. On a tie, including a
    word with no synsets at all, the tag that comes first in that order is
    returned.
    """
    synset_tags = [synset.pos() for synset in wordnet.synsets(word)]

    # Tags are inserted in a fixed order; most_common() keeps insertion order
    # among equal counts, which is what decides ties.
    tally = Counter()
    for tag in ("n", "v", "a", "r"):
        tally[tag] = synset_tags.count(tag)

    best_tag, _ = tally.most_common()[0]
    return best_tag
    
def lemmatize_text(text):
    """Lemmatize each token in `text` with the tag chosen by get_pos()."""
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token, get_pos(token)))
    return lemmas


def remove_oneandtwo_letter_word(words, dedupe=True):
    """Drop one- and two-letter words from a whitespace-separated string.

    Args:
        words: Text whose words are separated by whitespace.
        dedupe: When True (the default), each remaining word is kept only
            once, at the position of its first occurrence. Pass False to keep
            repeated words.

    Returns:
        The surviving words joined by single spaces, in their original order.
    """
    kept = [word for word in words.split() if len(word) > 2]
    if dedupe:
        # dict.fromkeys() removes repeats while preserving order. Going
        # through set() instead yields an arbitrary order that can differ
        # between interpreter runs, because str hashing is randomised.
        kept = list(dict.fromkeys(kept))
    return ' '.join(kept)



def normalize_data(words):
    """Run the token-level cleaning steps on `words` and join the result.

    The steps are applied in this order: non-ASCII removal, lowercasing,
    punctuation removal, stop-word removal, lemmatization.

    Returns:
        The processed tokens joined into one space-separated string.
    """
    pipeline = (
        remove_non_ascii,
        to_lowercase,
        remove_punctuation,
        remove_stopwords,
        lemmatize_text,
    )
    for step in pipeline:
        words = step(words)
    return ' '.join(words)

In [None]:
# Remove URLs from the 'Text' column. remove_url() overwrites the column on
# the frame it is given and returns that same frame.
df = remove_url(df)

  dataframe['Text'] = dataframe['Text'].str.replace(r's*https?://S+(s+|$)', ' ').str.strip()


In [None]:
# Delete every run of digits from each review.
df['Text'] = df['Text'].apply(remove_numbers)

In [None]:
# Tokenizing the data: each review string becomes a list of tokens.
# Series.apply works on the 'Text' column directly; a row-wise
# DataFrame.apply(axis=1) builds a Series for every row just to read one field
# and returns a DataFrame (not a Series) when the frame is empty.
df['Text'] = df['Text'].apply(nltk.word_tokenize)

In [None]:
# Call normalize_data to apply the remaining text-processing functions; it
# turns each token list back into a single space-separated string.
# Series.apply is used instead of a row-wise DataFrame.apply(axis=1) because
# only the 'Text' column is needed.
df['Text'] = df['Text'].apply(normalize_data)

In [None]:
# Removing one and two letter words from every review. Series.apply is used
# instead of a row-wise DataFrame.apply(axis=1) because only the 'Text' column
# is needed.
df['Text'] = df['Text'].apply(remove_oneandtwo_letter_word)
df.head()

Unnamed: 0,DataSet Entry ID,Text,Tagger Pid,Date,Defect,Components Mentioned,Comment,Authority?
0,88250594,magic countertop powerful mixer category origi...,bowenhc,8/29/17 8:56,No Defect,[Blade or Cutter Assembly],,False
1,54712811,magic countertop every speed work want peanut ...,bowenhc,8/29/17 8:56,No Defect,[No component mentioned],,False
2,26356181,ingredient wrong work use unit definitive week...,bowenhc,8/29/17 8:56,Performance Defect,"[Motors, Blade or Cutter Assembly]",,False
3,83791618,motor pick similar magic countertop two come t...,bowenhc,8/29/17 8:56,Performance Defect,"[Motors, Base]",,False
4,54877071,magic countertop book amp shaver work follow m...,bowenhc,8/29/17 8:56,Performance Defect,[No component mentioned],,False


### Cleaning the processed dataframe

In [None]:
# First rows of the dataframe after text pre-processing.
df.head()

Unnamed: 0,DataSet Entry ID,Text,Tagger Pid,Date,Defect,Components Mentioned,Comment,Authority?
0,88250594,magic countertop powerful mixer category origi...,bowenhc,8/29/17 8:56,No Defect,[Blade or Cutter Assembly],,False
1,54712811,magic countertop every speed work want peanut ...,bowenhc,8/29/17 8:56,No Defect,[No component mentioned],,False
2,26356181,ingredient wrong work use unit definitive week...,bowenhc,8/29/17 8:56,Performance Defect,"[Motors, Blade or Cutter Assembly]",,False
3,83791618,motor pick similar magic countertop two come t...,bowenhc,8/29/17 8:56,Performance Defect,"[Motors, Base]",,False
4,54877071,magic countertop book amp shaver work follow m...,bowenhc,8/29/17 8:56,Performance Defect,[No component mentioned],,False


In [None]:
# Keep only the columns needed downstream. .copy() makes final_df an
# independent dataframe, so assigning to its columns in later cells neither
# writes into a slice of df nor raises SettingWithCopyWarning.
required_columns = ['Text', 'Defect', 'Date']
final_df = df[required_columns].copy()

In [None]:
# Preview the reduced dataframe (Text, Defect, Date).
final_df

Unnamed: 0,Text,Defect,Date
0,magic countertop powerful mixer category origi...,No Defect,8/29/17 8:56
1,magic countertop every speed work want peanut ...,No Defect,8/29/17 8:56
2,ingredient wrong work use unit definitive week...,Performance Defect,8/29/17 8:56
3,motor pick similar magic countertop two come t...,Performance Defect,8/29/17 8:56
4,magic countertop book amp shaver work follow m...,Performance Defect,8/29/17 8:56
...,...,...,...
13857,lot burn toaster oven one old love timer toast...,No Defect,9/10/17 17:49
13859,wife she toaster birthday ever purchase best say,No Defect,9/10/17 17:50
13860,put whole pizza toaster oven look amaze contro...,No Defect,9/10/17 17:50
13861,warranty review pay defective toast come send ...,Performance Defect,9/10/17 17:50


In [None]:
# Removing time from Date column: parse the timestamps and keep only the
# calendar date. assign() builds a new dataframe instead of writing a column
# into final_df, which avoids the SettingWithCopyWarning raised when final_df
# is a slice of another frame.
final_df = final_df.assign(Date=pd.to_datetime(final_df['Date']).dt.date)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['Date'] = pd.to_datetime(final_df['Date']).dt.date


In [None]:
# Discard every row that has a missing value in any of the kept columns.
final_df = final_df.dropna(axis='index', how='any')
final_df

Unnamed: 0,Text,Defect,Date
0,magic countertop powerful mixer category origi...,No Defect,2017-08-29
1,magic countertop every speed work want peanut ...,No Defect,2017-08-29
2,ingredient wrong work use unit definitive week...,Performance Defect,2017-08-29
3,motor pick similar magic countertop two come t...,Performance Defect,2017-08-29
4,magic countertop book amp shaver work follow m...,Performance Defect,2017-08-29
...,...,...,...
13857,lot burn toaster oven one old love timer toast...,No Defect,2017-09-10
13859,wife she toaster birthday ever purchase best say,No Defect,2017-09-10
13860,put whole pizza toaster oven look amaze contro...,No Defect,2017-09-10
13861,warranty review pay defective toast come send ...,Performance Defect,2017-09-10


In [None]:
# Number of distinct cleaned reviews, printed before de-duplicating.
print(final_df["Text"].nunique())

# Keep the first row for each distinct cleaned text and renumber the rows
# 0..n-1. Rebinding the chained result avoids inplace=True on a frame that
# pandas may track as a slice, which raises SettingWithCopyWarning.
final_df = final_df.drop_duplicates(subset=['Text']).reset_index(drop=True)
final_df

48024


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


Unnamed: 0,Text,Defect,Date
0,magic countertop powerful mixer category origi...,No Defect,2017-08-29
1,magic countertop every speed work want peanut ...,No Defect,2017-08-29
2,ingredient wrong work use unit definitive week...,Performance Defect,2017-08-29
3,motor pick similar magic countertop two come t...,Performance Defect,2017-08-29
4,magic countertop book amp shaver work follow m...,Performance Defect,2017-08-29
...,...,...,...
48019,lot burn toaster oven one old love timer toast...,No Defect,2017-09-10
48020,wife she toaster birthday ever purchase best say,No Defect,2017-09-10
48021,put whole pizza toaster oven look amaze contro...,No Defect,2017-09-10
48022,warranty review pay defective toast come send ...,Performance Defect,2017-09-10


In [None]:
# Encode the labels as integers: 0 = "No Defect", 1 = "Performance Defect".
# Any other value is left unchanged. Mapping the column and rebinding final_df
# avoids .loc writes into a frame that pandas may track as a slice, which
# raise SettingWithCopyWarning.
defect_codes = {"No Defect": 0, "Performance Defect": 1}
final_df = final_df.assign(
    Defect=final_df['Defect'].map(lambda label: defect_codes.get(label, label))
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [None]:
# Preview the dataframe with the Defect labels encoded as 0 / 1.
final_df

Unnamed: 0,Text,Defect,Date
0,magic countertop powerful mixer category origi...,0,2017-08-29
1,magic countertop every speed work want peanut ...,0,2017-08-29
2,ingredient wrong work use unit definitive week...,1,2017-08-29
3,motor pick similar magic countertop two come t...,1,2017-08-29
4,magic countertop book amp shaver work follow m...,1,2017-08-29
...,...,...,...
48019,lot burn toaster oven one old love timer toast...,0,2017-09-10
48020,wife she toaster birthday ever purchase best say,0,2017-09-10
48021,put whole pizza toaster oven look amaze contro...,0,2017-09-10
48022,warranty review pay defective toast come send ...,1,2017-09-10


### Save the final cleaned data to a csv.
* Update the 'write_location' variable below

In [None]:
# Output folder for the cleaned dataset. The value must end with '/' because
# the output file name is appended to it with plain string concatenation.
write_location = '/content/drive/MyDrive/Colab Notebooks/MIS798_Files/Pre_processed_data/'

In [None]:
# Create the output folder first: to_csv() does not create missing
# directories and would otherwise fail on a fresh Drive.
os.makedirs(write_location, exist_ok=True)
final_df.to_csv(write_location + 'Processed_data.csv', index=False)
