In [13]:
import pandas as pd
import numpy as np

In [14]:
import matplotlib.pyplot as plt
%matplotlib inline

In [15]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [16]:
from nltk.corpus import stopwords

In [17]:
from bs4 import BeautifulSoup

In [18]:
import re
import tqdm
import pickle
import os

In [19]:
def _print(statement, arguments, do_print = True):
    
    if do_print:
        print(statement.format(*arguments))
    
    return

In [20]:
def print_lb(character, num = 60):
    """Print a separator line: ``character`` repeated ``num`` times.

    Parameters
    ----------
    character : str
        The string to repeat.
    num : int, default 60
        Number of repetitions.

    Returns
    -------
    None
    """
    separator = character * num
    print(separator)

# [1] Reading Data

## [1.1] Reading CSV file

In [21]:
# Load the training set from the relative data directory and preview its first three rows.
project_data = pd.read_csv('./data/train_data.csv')
project_data.head(3)

Unnamed: 0.1,Unnamed: 0,id,teacher_id,teacher_prefix,school_state,project_submitted_datetime,project_grade_category,project_subject_categories,project_subject_subcategories,project_title,project_essay_1,project_essay_2,project_essay_3,project_essay_4,project_resource_summary,teacher_number_of_previously_posted_projects,project_is_approved
0,160221,p253737,c90749f5d961ff158d4b4d1e7dc665fc,Mrs.,IN,2016-12-05 13:43:57,Grades PreK-2,Literacy & Language,"ESL, Literacy",Educational Support for English Learners at Home,My students are English learners that are work...,"\""The limits of your language are the limits o...",,,My students need opportunities to practice beg...,0,0
1,140945,p258326,897464ce9ddc600bced1151f324dd63a,Mr.,FL,2016-10-25 09:22:10,Grades 6-8,"History & Civics, Health & Sports","Civics & Government, Team Sports",Wanted: Projector for Hungry Learners,Our students arrive to our school eager to lea...,The projector we need for our school is very c...,,,My students need a projector to help with view...,7,1
2,21895,p182444,3465aaf82da834c0582ebd0ef8040ca0,Ms.,AZ,2016-08-31 12:03:56,Grades 6-8,Health & Sports,"Health & Wellness, Team Sports",Soccer Equipment for AWESOME Middle School Stu...,"\r\n\""True champions aren't always the ones th...",The students on the campus come to school know...,,,"My students need shine guards, athletic socks,...",1,0


## [1.2] Column Names

In [22]:
# List every column label of the loaded frame (passed as a one-element argument list to _print).
_print("Project DataFrame columns information: \n\n{}", [project_data.columns.values])

Project DataFrame columns information: 

['Unnamed: 0' 'id' 'teacher_id' 'teacher_prefix' 'school_state'
 'project_submitted_datetime' 'project_grade_category'
 'project_subject_categories' 'project_subject_subcategories'
 'project_title' 'project_essay_1' 'project_essay_2' 'project_essay_3'
 'project_essay_4' 'project_resource_summary'
 'teacher_number_of_previously_posted_projects' 'project_is_approved']


# [2] Data Preprocessing

## [2.1] Preprocessing Categorical features: project_grade_category

> Before preprocessing `project_grade_category` value_counts

In [23]:
# Row count per grade bucket, using the raw labels as they appear in the CSV.
project_data['project_grade_category'].value_counts()

Grades PreK-2    44225
Grades 3-5       37137
Grades 6-8       16923
Grades 9-12      10963
Name: project_grade_category, dtype: int64

In [24]:
# Normalise the grade labels into lowercase, underscore-separated tokens
# (e.g. "Grades PreK-2" -> "grades_prek_2").
# Vectorised .str methods replace the per-row lambda: same result for strings, but a
# missing value stays NaN instead of raising AttributeError.
# regex=False keeps ' ' and '-' as literal characters.
project_data['project_grade_category'] = (
    project_data['project_grade_category']
    .str.replace(' ', '_', regex=False)
    .str.replace('-', '_', regex=False)
    .str.lower()
)

> After preprocessing `project_grade_category` value_counts

In [25]:
# Same count after normalisation: the totals should match, only the labels change.
project_data['project_grade_category'].value_counts()

grades_prek_2    44225
grades_3_5       37137
grades_6_8       16923
grades_9_12      10963
Name: project_grade_category, dtype: int64

In [26]:
def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    Only the straight apostrophe (') is recognised. The rules are purely
    textual, so ``'s`` is always expanded to " is" — possessives such as
    "teacher's" become "teacher is" as well.

    Parameters
    ----------
    phrase : str
        Text that may contain contractions.

    Returns
    -------
    str
        ``phrase`` with the contractions expanded.
    """
    # Irregular stems first. The initial letter is captured so that a
    # sentence-initial "Won't" / "Can't" becomes "Will not" / "Can not"
    # instead of falling through to the generic n't rule ("Wo not" / "Ca not").
    phrase = re.sub(r"([Ww])on't", r"\1ill not", phrase)
    phrase = re.sub(r"([Cc])an\'t", r"\1an not", phrase)

    # Generic suffixes. Order matters: n't must be handled before the bare 't.
    general_rules = (
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, expansion in general_rules:
        phrase = re.sub(pattern, expansion, phrase)

    return phrase

In [27]:
# Stop-word list adapted from https://gist.github.com/sebleier/554280
# The negations 'no', 'nor' and 'not' are deliberately absent so they survive text cleaning.
# NOTE: this assignment rebinds `stopwords`, hiding the `nltk.corpus.stopwords` import made earlier.
stopwords = [
    # pronouns and determiners
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
    'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself',
    'she', "she's", 'her', 'hers', 'herself',
    'it', "it's", 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves',
    'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those',
    # auxiliary verbs
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
    # articles, conjunctions, prepositions
    'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
    'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during',
    'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off',
    'over', 'under', 'again', 'further',
    # adverbs and quantifiers
    'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how',
    'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
    'only', 'own', 'same', 'so', 'than', 'too', 'very',
    # contraction fragments and leftovers
    's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now',
    'd', 'll', 'm', 'o', 're', 've', 'y',
    'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't",
    'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma',
    'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't",
    'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't",
    'won', "won't", 'wouldn', "wouldn't",
]

In [28]:
def preprocessing_text(raw_txt):
    """Turn one raw free-text value into a lowercase, space-separated token string.

    Steps, in order:
      1. replace literal escape sequences (a backslash followed by r, n or ")
         with a space;
      2. remove URLs (anything starting with http / https up to the next whitespace);
      3. expand English contractions with ``decontracted``;
      4. strip HTML markup with BeautifulSoup (lxml parser);
      5. drop every whitespace-delimited token that contains a digit;
      6. collapse each run of non-alphanumeric characters into one space;
      7. lowercase and drop words found in the module-level ``stopwords`` list.

    Parameters
    ----------
    raw_txt : str
        Raw text. A non-string value (e.g. NaN / None from a missing cell)
        is treated as empty text.

    Returns
    -------
    str
        The cleaned text; "" when nothing survives or the input is not a string.
    """
    # Missing cells arrive as float NaN / None when mapped over a column;
    # re.sub would raise TypeError on them.
    if not isinstance(raw_txt, str):
        return ""

    # Must run BEFORE step 6: that step removes every backslash, which would
    # otherwise leave stray "r" / "n" tokens behind and make these replaces no-ops.
    cleaned_txt = (raw_txt.replace('\\r', ' ')
                          .replace('\\n', ' ')
                          .replace('\\"', ' '))

    # Raw-string patterns: \S and \d are regex escapes, not string escapes.
    cleaned_txt = re.sub(r'https\S+|http\S+', "", cleaned_txt)

    cleaned_txt = decontracted(cleaned_txt)

    cleaned_txt = BeautifulSoup(cleaned_txt, "lxml").get_text()

    cleaned_txt = re.sub(r"\S*\d\S*", "", cleaned_txt)
    cleaned_txt = re.sub(r"[^A-Za-z0-9]+", " ", cleaned_txt)

    # Only ASCII letters and spaces remain here, so lowercasing the whole string
    # before splitting is equivalent to lowercasing word by word.
    return " ".join(word for word in cleaned_txt.lower().split() if word not in stopwords)

In [42]:
# Clean every title with preprocessing_text; the raw titles are overwritten in place.
project_data['project_title'] = project_data['project_title'].apply(preprocessing_text)

In [43]:
def show_series_particular_val(series, idxs = [1, 10, 100, 1000]):
    """Print the entries of ``series`` found at each key in ``idxs``, one per line.

    Parameters
    ----------
    series : indexable
        Anything supporting ``series[key]``; each key is looked up as given.
    idxs : iterable, default [1, 10, 100, 1000]
        Keys to look up. The default list is only iterated, never modified.

    Returns
    -------
    None
    """
    for key in idxs:
        value = series[key]
        _print("{}", [value])

In [44]:
# Spot-check a few cleaned titles at the helper's default labels (1, 10, 100, 1000).
show_series_particular_val(project_data['project_title'])

wanted projector hungry learners
reading changes lives
century learners century technology
sailing super grade year
