In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem import SnowballStemmer

import nltk
nltk.download('stopwords')

In [None]:
# Full NLTK English stopword list, printed so it can be inspected.
stops = set(stopwords.words("english"))
print(stops)

# Smaller hand-picked stopword list: the lower-case entries first, followed by
# capitalised forms of a few of them.
alternative_stops = [
    'the', 'a', 'an', 'and', 'but', 'if', 'or', 'because', 'as', 'what',
    'which', 'this', 'that', 'these', 'those', 'then', 'just', 'so', 'than',
    'such', 'both', 'through', 'about', 'for', 'is', 'of', 'while', 'during',
    'to',
] + [
    'What', 'Which', 'Is', 'If', 'While', 'This',
]

In [None]:
# Parameters:
# hyper_cleaning: If true, do a second, more thorough round of data cleaning
# remove_punctuation: If true, remove all punctuation from the text
# remove_stopwords: If true, remove stopwords (NLTK's English list shown above, unless use_alternative_stopwords is set)
# use_alternative_stopwords: If true (and remove_stopwords is true), use the smaller hand-picked stopword list instead
# stem_words: If true, shorten words to their stems

def data_cleaning(train, test, hyper_cleaning = True, remove_punctuation = True, remove_stopwords = True, use_alternative_stopwords = True, stem_words = True):
    """Clean the question columns of both frames and write each result to a CSV file.

    Every value in ``question1`` and ``question2`` is converted with ``str`` and
    passed through ``text_to_wordlist`` with the flags given here. The cleaned
    frames are written (without the index) to ``train (cleaned)<tags>.csv`` and
    ``test (cleaned)<tags>.csv``, where one tag is appended for every flag that
    is true, so different option combinations land in different files.

    The frames passed in are not modified: cleaning is done on copies, which
    keeps the raw text available and makes repeated calls with different flags
    start from the same input.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that contain ``question1`` and ``question2`` columns.
    hyper_cleaning : bool
        Run the second, more thorough cleaning pass.
    remove_punctuation : bool
        Strip punctuation characters from the text.
    remove_stopwords : bool
        Drop stopwords before cleaning.
    use_alternative_stopwords : bool
        With ``remove_stopwords``, use the smaller hand-picked stopword list.
    stem_words : bool
        Reduce words to their stems.

    Returns
    -------
    None
    """
    option_flags = (hyper_cleaning, remove_punctuation, remove_stopwords, use_alternative_stopwords, stem_words)

    def clean_column(column):
        # Values are coerced with str() first, so a non-string cell is cleaned
        # as its string form.
        return [text_to_wordlist(str(text), *option_flags) for text in column]

    cleaned_frames = []
    for frame in (train, test):
        # A real copy: plain assignment would only alias the caller's frame and
        # overwrite its raw questions.
        frame_copy = frame.copy()
        for column in ('question1', 'question2'):
            frame_copy[column] = clean_column(frame[column])
        cleaned_frames.append(frame_copy)
    train_copy, test_copy = cleaned_frames

    # One tag per enabled option, in a fixed order, so the file name records
    # how the data was produced.
    tags = (
        (hyper_cleaning, "(hyper_cleaned)"),
        (remove_punctuation, "(punctuation_removed)"),
        (remove_stopwords, "(stopwords_removed)"),
        (use_alternative_stopwords, "(alternative_stopwords_used)"),
        (stem_words, "(words_shortened)"),
    )
    suffix = "".join(tag for enabled, tag in tags if enabled)

    train_copy.to_csv("train (cleaned)" + suffix + '.csv', index=False)
    test_copy.to_csv("test (cleaned)" + suffix + '.csv', index=False)

In [None]:
# Reduced stopword list used when ``use_alternative_stopwords`` is true.
# Tokens are lower-cased before the lookup, so only lower-case entries can match.
_ALTERNATIVE_STOPS = frozenset([
    'the', 'a', 'an', 'and', 'but', 'if', 'or', 'because', 'as', 'what',
    'which', 'this', 'that', 'these', 'those', 'then', 'just', 'so', 'than',
    'such', 'both', 'through', 'about', 'for', 'is', 'of', 'while', 'during',
    'to',
])

# NLTK objects are built on first use and then reused, instead of being
# rebuilt for every single question.
_NLTK_RESOURCES = {}

# First cleaning pass: (pattern, replacement) pairs applied in this order.
_FIRST_PASS_SUBSTITUTIONS = (
    # Note: inside the class, "+-=" is a range ('+' through '='), so ':', ';'
    # and '<' are kept here in addition to the characters spelled out.
    (r"[^A-Za-z0-9^,!.\/'+-=]", " "),
    (r"what's", "what is "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"can't", "cannot "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    (r",", " "),
    (r"\.", " "),
    (r"!", " ! "),
    (r"\/", " "),
    (r"\^", " ^ "),
    (r"\+", " + "),
    (r"\-", " - "),
    (r"\=", " = "),
    (r"'", " "),
    # "60k" -> "60000"; the word boundary keeps units such as "60kg" or "5kms"
    # from being turned into "60000g" / "5000ms".
    (r"(\d+)k\b", r"\g<1>000"),
    (r":", " : "),
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" u s ", " american "),
    # Keep the surrounding spaces so "911" is not glued to its neighbours.
    (r" 9 11 ", " 911 "),
    (r"e - mail", "email"),
    # Only a stand-alone "j k" is merged, not e.g. "raj kumar".
    (r"\bj k\b", "jk"),
    (r"\s{2,}", " "),
)

# Second ("hyper") cleaning pass. It runs on lower-cased text from which the
# first substitution below strips every non-alphanumeric character, so patterns
# containing apostrophes, hyphens or upper-case letters can never match here
# and are not listed. Patterns starting with "\0" are also omitted: in a regex
# "\0" is the NUL character, which the first pass has already removed.
_HYPER_PASS_SUBSTITUTIONS = (
    (r"[^A-Za-z0-9]", " "),
    (r" m ", " am "),
    (r" e g ", " eg "),
    (r" b g ", " bg "),
    (r" 9 11 ", " 911 "),
    (r"\s{2,}", " "),
    (r"quikly", "quickly"),
    (r" usa ", " America "),
    (r" u s ", " America "),
    (r" uk ", " England "),
    (r"india", "India"),
    (r"switzerland", "Switzerland"),
    (r"china", "China"),
    (r"chinese", "Chinese"),
    (r"imrovement", "improvement"),
    (r"intially", "initially"),
    (r"quora", "Quora"),
    # Leading space kept so the previous word is not glued on.
    (r" dms ", " direct messages "),
    (r"demonitization", "demonetization"),
    (r"actived", "active"),
    (r"kms", " kilometers "),
    (r" cs ", " computer science "),
    (r" upvotes ", " up votes "),
    (r"calender", "calendar"),
    # The abbreviations below must not be surrounded by letters; otherwise
    # words like "curious", "angst" or "kidnap" would be rewritten.
    (r"(?<![A-Za-z])ios(?![A-Za-z])", "operating system"),
    (r"(?<![A-Za-z])gps(?![A-Za-z])", "GPS"),
    (r"(?<![A-Za-z])gst(?![A-Za-z])", "GST"),
    (r"programing", "programming"),
    (r"bestfriend", "best friend"),
    (r"(?<![A-Za-z])dna(?![A-Za-z])", "DNA"),
    (r"banglore", "Banglore"),
)


def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, loading it once."""
    if 'stops' not in _NLTK_RESOURCES:
        _NLTK_RESOURCES['stops'] = frozenset(stopwords.words("english"))
    return _NLTK_RESOURCES['stops']


def _english_stemmer():
    """Return a shared English Snowball stemmer, creating it once."""
    if 'stemmer' not in _NLTK_RESOURCES:
        _NLTK_RESOURCES['stemmer'] = SnowballStemmer('english')
    return _NLTK_RESOURCES['stemmer']


def _apply_substitutions(text, substitutions):
    """Apply each (pattern, replacement) pair to ``text`` in order."""
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text


def text_to_wordlist(text, hyper_cleaning, remove_punctuation, remove_stopwords, use_alternative_stopwords, stem_words):
    """Clean one piece of text and return it as a single string.

    Steps, in order:

    1. Lower-case the text and split it on whitespace.
    2. If ``remove_stopwords``: drop tokens found in the stopword list (the
       reduced list when ``use_alternative_stopwords`` is true, NLTK's English
       list otherwise). This runs before any punctuation handling, so a token
       with punctuation attached (e.g. ``"the,"``) is not treated as a stopword.
    3. Apply the first-pass substitutions (contractions, punctuation spacing,
       a few abbreviations).
    4. If ``hyper_cleaning``: apply the second-pass substitutions (strip every
       non-alphanumeric character, fix some misspellings, normalise some names).
    5. If ``remove_punctuation``: delete every character in
       ``string.punctuation``.
    6. If ``stem_words``: stem each whitespace-separated word with the English
       Snowball stemmer and re-join with single spaces.

    Parameters
    ----------
    text : str
        Text to clean.
    hyper_cleaning, remove_punctuation, remove_stopwords,
    use_alternative_stopwords, stem_words : bool
        Switches for the optional steps described above.

    Returns
    -------
    str
        The cleaned text. Unless ``stem_words`` is true the result may contain
        leading, trailing or repeated spaces.
    """
    # Convert words to lower case and split them
    words = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = _ALTERNATIVE_STOPS if use_alternative_stopwords else _english_stopwords()
        words = [w for w in words if w not in stops]

    text = " ".join(words)

    # Clean the text
    text = _apply_substitutions(text, _FIRST_PASS_SUBSTITUTIONS)

    if hyper_cleaning:
        # Clean the text (second round)
        text = _apply_substitutions(text, _HYPER_PASS_SUBSTITUTIONS)

    if remove_punctuation:
        # Remove punctuation from text
        text = ''.join([c for c in text if c not in punctuation])

    # Optionally, shorten words to their stems
    if stem_words:
        stemmer = _english_stemmer()
        text = " ".join(stemmer.stem(word) for word in text.split())

    return text

In [None]:
# Read the raw train and test CSVs from the working directory, then run the
# cleaning step with its default options.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

data_cleaning(train, test)