## <font color='blue'>This Jupyter Notebook processes text, builds a corpus, and explores the most common features</font>

### Import necessary packages and helper functions

In [1]:
from cleaner_funcs import clean, clean_text_string, clean_list
import collections
import csv
from helpers import add_stopwords, load_csv, save_to_csv
from NLP_functions import add_stopwords, get_top_keywords, top_features
import json
from NLPPipe import NLPPipe
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer, TreebankWordTokenizer
from nltk.util import ngrams
import pandas as pd
import pickle
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from textblob import TextBlob
import os

### Setting path to data files and checking current working directory

In [2]:
# Locate the project's data directory relative to where the notebook runs
# instead of hard-coding one user's home directory. This assumes the kernel's
# working directory is the project's `notebooks` folder (see the output below),
# so `data` is its sibling. The empty last component keeps the trailing path
# separator that later cells rely on when they concatenate onto data_path.
data_path = os.path.join(os.path.dirname(os.getcwd()), 'data', '')

# Show the working directory so the assumption above can be checked at a glance.
os.getcwd()

'C:\\Users\\Meehir\\Documents\\GitHub\\project-4-public\\notebooks'

### Reading in comment data from JSON

In [3]:
# Build the file path with os.path.join rather than hard-coded backslashes so
# the cell is not tied to Windows path separators.
json_path = os.path.join(data_path, 'json_files', 'combined_2000.json')

# Parse the scraped threads; the next cell iterates `data` as a sequence of
# threads, each holding a 'comments' list.
with open(json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

### Making a list of comment content strings

In [4]:
# Flatten every thread's comment list into one list holding the value stored
# under each comment's 'comment' key, in thread order.
comms = [
    entry['comment']
    for thread in data
    for entry in thread['comments']
]

### Checking length (i.e. number of comments scraped)

In [5]:
# Total number of comment entries gathered across all threads.
len(comms)

16451

### Copying to variable <font color='blue'>*corpus*</font>

In [6]:
# Take an independent (shallow) copy so that later changes to `corpus` cannot
# silently alter `comms`; a plain assignment would only create a second name
# for the same list object.
corpus = list(comms)

### Load scikit-learn's built-in English stop words and add "book"

In [7]:
# Seed the stop-word list with scikit-learn's English stop words plus the
# extra word "book"; it is kept as a list so later cells can extend it.
extra_stop_words = {"book"}
my_stop_words = list(text.ENGLISH_STOP_WORDS | extra_stop_words)

### Make nlp object
1. fit the model to the corpus
2. transform the corpus with the fitted model
3. convert the result to a dense array

In [8]:
# Assemble the pipeline: a TF-IDF vectorizer (vocabulary capped at 15,000
# features) together with the project's cleaning function, a Treebank tokenizer
# and a Porter stemmer.
# scikit-learn documents `stop_words` as 'english', a list or None, and recent
# releases reject a set during parameter validation, so pass a de-duplicated,
# sorted list instead of a set.
nlp = NLPPipe(vectorizer=TfidfVectorizer(stop_words=sorted(set(my_stop_words)),
                                         max_features=15000),
              cleaning_function=clean,
              tokenizer=TreebankWordTokenizer().tokenize,
              stemmer=PorterStemmer())

nlp.fit(corpus)
# NOTE(review): the dense conversion is only used to display the matrix; for a
# corpus of this size it can be memory-hungry.
nlp.transform(corpus).toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [9]:
# Print what top_features returns for the fitted pipeline with arguments 0 and
# 100 (presumably the 100 highest-ranked terms; confirm against
# NLP_functions.top_features). The terms appear in stemmed form.
print(top_features(nlp, 0, 100))

['penn', 'school', 'thi', 'appli', 'year', 'student', 'wa', 'like', 'think', 'just', 'applic', 'know', 'colleg', 'good', 'don', 'accept', 'ani', 'wharton', 'class', 'want', 'realli', 'veri', 'ha', 'ed', 'upenn', 'onli', 'major', 'gpa', 'thank', 'program', 'score', 'sat', 'admiss', 'chanc', 'work', 'time', 'make', 'say', 'got', 'becaus', 'look', 'did', 'peopl', 'help', 'essay', 'decis', 'high', 'mani', 'math', 'doe', 'sure', 'need', 'cours', 'ap', 'whi', 'best', 'lot', 'act', 'thing', 'test', 'hope', 'experi', 'state', 'reject', 'univers', 'busi', 'differ', 'gener', 'great', 'scienc', 'anyon', 'subject', 'competit', 'financi', 'probabl', 'commun', 'recommend', 'hi', 'tri', 'aid', 'grade', 'club', 'stat', 'interview', 'rank', 'senior', 'sinc', 'intern', 'ec', 'place', 've', 'way', 'transfer', 'rate', 'said', 'luck', 'summer', 'll', 'write', 'app']


### Adding to stop words from top 100 terms...

In [10]:
# Uninformative stems hand-picked from the top-100 output above; each term is
# listed exactly once.
words_to_del = ['ve', 'thi', 'wa', 'like', 'year', 'just', 'know', 'good', 'don', 'ani', 'want', 'realli', 'veri',
                'ha', 'thank', 'say', 'got', 'becaus', 'look', 'make', 'time', 'did', 'peopl', 'doe', 'sure',
                'need', 'whi', 'lot', 'thing', 'state', 'gener', 'great', 'anyon', 'probabl', 'hi', 'tri', 'sinc', 'ec',
                'way', 'said', 'll', 'write']

# Merge the hand-picked terms into the stop-word list. add_stopwords is
# imported from both helpers and NLP_functions; the NLP_functions import runs
# last, so that is the function bound to the name here.
my_stop_words = add_stopwords(my_stop_words, words_to_del)

### Rerunning nlp model to account for new stop words

In [11]:
# Rebuild the pipeline from scratch so the enlarged stop-word list takes effect.
# `stop_words` is given a de-duplicated, sorted list because scikit-learn
# documents the parameter as 'english', a list or None, and recent releases
# reject a set during parameter validation.
nlp = NLPPipe(vectorizer=TfidfVectorizer(stop_words=sorted(set(my_stop_words)),
                                         max_features=15000),
              cleaning_function=clean,
              tokenizer=TreebankWordTokenizer().tokenize,
              stemmer=PorterStemmer())

nlp.fit(corpus)
# NOTE(review): the dense conversion is only used to display the matrix; for a
# corpus of this size it can be memory-hungry.
nlp.transform(corpus).toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [12]:
# Same inspection as before the stop-word additions, now against the refitted
# pipeline, to see which terms rise into the list returned for arguments 0 and 100.
print(top_features(nlp, 0, 100))

['penn', 'school', 'appli', 'student', 'think', 'applic', 'colleg', 'accept', 'wharton', 'class', 'ed', 'upenn', 'onli', 'major', 'gpa', 'program', 'score', 'sat', 'admiss', 'chanc', 'work', 'help', 'essay', 'decis', 'high', 'mani', 'math', 'cours', 'ap', 'best', 'act', 'test', 'hope', 'experi', 'reject', 'univers', 'busi', 'differ', 'scienc', 'subject', 'competit', 'financi', 'commun', 'recommend', 'aid', 'grade', 'club', 'stat', 'interview', 'rank', 'senior', 'intern', 'place', 'transfer', 'rate', 'summer', 'luck', 'app', 'better', 'didn', 'everyon', 'pretti', 'come', 'question', 'read', 'ye', 'award', 'admit', 'ca', 'mean', 'research', 'ask', 'definit', 'extracurricular', 'offer', 'consid', 'incom', 'els', 'use', 'ii', 'doesn', 'ivi', 'attend', 'person', 'http', 'academ', 'submit', 'feel', 'volunt', 'post', 'job', 'kid', 'studi', 'day', 'receiv', 'talk', 'engin', 'number', 'email', 'start']


In [None]:
# Refit the pipeline with the current stop-word list before inspecting the next
# slice of features.
# `stop_words` is given a de-duplicated, sorted list because scikit-learn
# documents the parameter as 'english', a list or None, and recent releases
# reject a set during parameter validation.
nlp = NLPPipe(vectorizer=TfidfVectorizer(stop_words=sorted(set(my_stop_words)),
                                         max_features=15000),
              cleaning_function=clean,
              tokenizer=TreebankWordTokenizer().tokenize,
              stemmer=PorterStemmer())

nlp.fit(corpus)
# NOTE(review): the dense conversion is only used to display the matrix; for a
# corpus of this size it can be memory-hungry.
nlp.transform(corpus).toarray()

In [None]:
# Move past the first 100 terms: print what top_features returns for arguments
# 100 and 250 (presumably the next-ranked slice of terms; confirm against
# NLP_functions.top_features).
print(top_features(nlp, 100,250))

In [None]:
# Further uninformative stems to drop, written as one whitespace-separated
# string and split into a list of individual terms.
words_to_del = (
    "high place didn everyon pretti come ye ca ask els use ii doesn "
    "ivi person http kid day talk number start"
).split()

# Fold this batch of terms into the stop-word list. The name is rebound to the
# helper's return value, so re-running the cell feeds it its own earlier output.
my_stop_words = add_stopwords(my_stop_words, words_to_del)

In [None]:
# Next batch of uninformative stems to drop. Each term appears exactly once,
# in order of first appearance.
words_to_del = ['didn', 'everyon', 'come', 'read', 'ye', 'els', 'anoth', 'end', 'rd', 'someth', 'mayb', 'anyth',
                'possibl', 'thread', 'bc', 'wonder', 'alreadi', 'took', 'use', 'doesn', 'ii', 'ivi', 'http', 'howev',
                'right', 'send', 'addit', 'befor', 'choic', 'earli', 'hour', 'www', 'everi', 'abl', 'went', 'let',
                'load', 'heard', 'tell', 'live', 'sea', 'big', 'pleas', 'guy', 'weight', 'includ', 'someon', 'bit',
                'urm', 'chang', 'inform', 'guess', 'mention', 'taken', 'dure', 'believ', 'com', 'especi', 'isn', 'abov',
                'lol', 'compar', 'wrote', 'agre', 'object', 'non', 'noth', 'alway', 'edu', 'hey', 'sent']

In [None]:
# Add the batch defined in the previous cell to the stop-word list, rebinding
# my_stop_words to whatever add_stopwords returns.
my_stop_words = add_stopwords(my_stop_words, words_to_del)

In [None]:
# The stop-word list has grown since `nlp` was last fitted, so refit before
# inspecting the features again; without this the call below would query the
# pipeline that was fitted before those additions.
# `stop_words` is passed as a de-duplicated, sorted list because scikit-learn
# documents the parameter as 'english', a list or None.
nlp = NLPPipe(vectorizer=TfidfVectorizer(stop_words=sorted(set(my_stop_words)),
                                         max_features=15000),
              cleaning_function=clean,
              tokenizer=TreebankWordTokenizer().tokenize,
              stemmer=PorterStemmer())
nlp.fit(corpus)
# Mirrors the fit-then-transform sequence of the earlier cells in case
# top_features depends on state set by transform (TODO confirm).
nlp.transform(corpus)

print(top_features(nlp, 100, 250))

In [None]:
# Last small batch of stems to drop, split out of one space-separated string.
words_to_del = "ab kind given overal happen singl pre".split()

In [None]:
# Final extension of the stop-word list with the batch defined just above.
my_stop_words = add_stopwords(my_stop_words, words_to_del)

In [None]:
# Sanity check: display the container type of the stop-word collection before
# it is written out in the next cell.
type(my_stop_words)

In [None]:
# data_path already ends with a path separator, so appending "\\csv_files\\"
# produced a doubled, Windows-only separator. Join the parts instead; the empty
# last component keeps the trailing separator the directory argument had before.
csv_dir = os.path.join(data_path, 'csv_files', '')

# Persist the final stop-word list through the project's save_to_csv helper.
save_to_csv(my_stop_words, csv_dir, 'my_stop_words.csv')