# ML-for-Good-Hackathon
# Team Name: Vcare
# Participants: Sanjit Mehta, Naveena Chandwani, Rohith Rathod

### Import Common packages

In [1]:
import numpy as np
import pandas as pd
import re
import string
import math
import glob

### Import NLP related packages

In [2]:
#pip install contractions
import contractions
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
#nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from IPython.display import display

import multiprocessing

#pip install gensim
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors


from time import time 
from collections import defaultdict

from sklearn.cluster import KMeans

### Import Data and Drop duplicates

In [3]:
def read_data():
    path = 'C:/Users/NLP/data/' # use your path
    all_files = glob.glob(path + "/*.csv")

    df_list = []

    for filename in all_files:
        df = pd.read_csv(filename, index_col=None, header=0)
        df_list.append(df)

    df = pd.concat(df_list, axis=0, ignore_index=True)

    # Drop duplicates
    cols_req = ["focus_group_subtype", "focus_group_subtype_id", "doc_no_within_subtype", "question_id", 
                "question_text", "parent_num", "parent_answer"]

    df.drop_duplicates(inplace=True)
    df = df[cols_req]
    return df 

In [4]:
input_df = read_data()
input_df.head()

Unnamed: 0,focus_group_subtype,focus_group_subtype_id,doc_no_within_subtype,question_id,question_text,parent_num,parent_answer
0,gaming_group,1,1,1,"So, I was thinking we could start by just goin...",5,Sure. Hi. My name is Parent 5. I have three ki...
1,gaming_group,1,1,1,"So, I was thinking we could start by just goin...",1,"Hi everyone. My name is Parent 1, I have two b..."
2,gaming_group,1,1,1,"So, I was thinking we could start by just goin...",3,Hi everybody. My name is Parent 3 and I have a...
3,gaming_group,1,1,1,"So, I was thinking we could start by just goin...",4,"Hi, I'm Parent 4, I have a 15-year-old daughte..."
4,gaming_group,1,1,1,"So, I was thinking we could start by just goin...",2,"Oh, I'm sorry. I lost connection, I couldn't h..."


# Preprocessing of text Data
1. Expand contraction
2. Case handling
3. Remove punctuations
4. Remove words and digits containing digits
5. Remove stop word
6. Lemmatization
7. Remove Extra Spaces 

#### 1. Expand contraction
Contraction is the shortened form of a word like don’t stands for do not, aren’t stands for are not. Like this, we need to expand this contraction in the text data for better analysis.

In [5]:
def expand_contraction(df,columns=[]):
    
    for col in columns:
        df[col] = df[col].apply(lambda text:contractions.fix(text))
        
    return df

#### 2. Case handling
If the text is in the same case, it is easy for a machine to interpret the words because the lower case and upper case are treated differently by the machine. for example, words like Ball and ball are treated differently by machine. So, we need to make the text in the same case and the most preferred case is a lower case to avoid such problems.

In [6]:
def case_handling(df,columns=[]):
    
    for col in columns:
        df[col] = df[col].str.lower() 
        
    return df       

#### 3. Remove punctuations
One of the other text processing techniques is removing punctuations. there are total 32 main punctuations that need to be taken care of. we can directly use the string module with a regular expression to replace any punctuation in text with an empty string

In [7]:
def remove_punctuations(df,columns=[]):
    
    for col in columns:
        df[col] = df[col].apply(lambda text: re.sub('[%s]' % re.escape(string.punctuation), '' , text))
        df[col] = df[col].apply(lambda text: text.replace("_"," "))
        
    return df   

#### 4. Remove words and digits containing digits
Sometimes it happens that words and digits combine are written in the text which creates a problem for machines to understand. hence, We need to remove the words and digits which are combined like game57 or game5ts7. This type of word is difficult to process so better to remove them or replace them with an empty string. we use regular expressions for this. 

In [8]:
def remove_words_dgits(df,columns=[]):
    
    for col in columns:
        df[col] = df[col].apply(lambda text: re.sub(" \d+",'',text))

    return df

#### 5. Remove stopword
Stopwords are the most commonly occurring words in a text which do not provide any valuable information. stopwords like they, there, this, where, etc are some of the stopwords. NLTK library is a common library that is used to remove stopwords and include approximately 180 stopwords which it removes. If we want to add any new word to a set of words then it is easy using the add method.

In [9]:
def remove_stopwords(df, columns=[]):
    
    stop_words = set(stopwords.words('english'))
    
    def remove_sw(text):
        txt_output = " ".join([word for word in str(text).split() if word not in stop_words])
        return txt_output
    
    for col in columns:
        df[col] = df[col].apply(lambda text: remove_sw(text))
    
    return df

#### 6. Lemmatization
Lemmatization is similar to stemming, used to stem the words into root word but differs in working. Actually, Lemmatization is a systematic way to reduce the words into their lemma by matching them with a language dictionary.

In [10]:
def lemmatize_words(df, columns=[]):
    
    lemmatizer = WordNetLemmatizer()
    
    def lemmatize(text):
        text_output = " ".join([lemmatizer.lemmatize(word) for word in text.split()])
        return text_output
    
    for col in columns:
        df[col] = df[col].apply(lambda text: lemmatize(text))
        
    return df

#### 7. Remove Extra Spaces 
Most of the time text data contain extra spaces or while performing the above preprocessing techniques more than one space is left between the text so we need to control this problem. regular expression library performs well to solve this problem

In [11]:
def remove_extra_spaces(df,columns=[]):
    
    for col in columns:
        df[col] = df[col].apply(lambda text: re.sub(' +', ' ', text))
        
    return df 

In [12]:
def data_preprocessing(df, columns=[]):
    
    df = expand_contraction(df,columns)
    df = case_handling(df,columns) 
    df = remove_punctuations(df,columns)
    df = remove_words_dgits(df,columns)  
    df = remove_stopwords(df,columns) 
    df = lemmatize_words(df, columns)
    df = remove_extra_spaces(df,columns) 
    
    return df

## Data preprocessing function call

In [13]:
# Preprocess the data
columns=['parent_answer']
output_df =  data_preprocessing(input_df, columns)
output_df.head(10)

Unnamed: 0,focus_group_subtype,focus_group_subtype_id,doc_no_within_subtype,question_id,question_text,parent_num,parent_answer
0,gaming_group,1,1,1,"So, I was thinking we could start by just goin...",5,sure hi name parent three kid nine boy good sh...
1,gaming_group,1,1,1,"So, I was thinking we could start by just goin...",1,hi everyone name parent two boy ayearold almos...
2,gaming_group,1,1,1,"So, I was thinking we could start by just goin...",3,hi everybody name parent anyearold daughter an...
3,gaming_group,1,1,1,"So, I was thinking we could start by just goin...",4,hi parent ayearold daughter ayearold son one s...
4,gaming_group,1,1,1,"So, I was thinking we could start by just goin...",2,oh sorry lost connection could hear anyone nam...
5,gaming_group,1,1,1,"So, I was thinking we could start by just goin...",2,crosstalk series yes myyearolds love certain t...
6,gaming_group,1,1,2,We thought that a good way to start might be i...,2,oh okay well use much mean teacher would assig...
7,gaming_group,1,1,2,We thought that a good way to start might be i...,3,would say thing daughter fifth grade would cou...
8,gaming_group,1,1,2,We thought that a good way to start might be i...,5,similar experience parent high schooler lot ho...
9,gaming_group,1,1,2,We thought that a good way to start might be i...,5,weekend hour something like much youtube watch...
