In [68]:
import glob
import os
import re
import pandas as pd
import numpy as np
from ast import literal_eval

In [69]:
# Directory holding the scraped ejmr_pg*.csv files. Set the EJMR_DATA_DIR
# environment variable to run the notebook on another machine; the original
# author's location stays as the fallback so existing runs are unaffected.
DATA_DIR = os.environ.get(
    "EJMR_DATA_DIR",
    "/Users/palermospenano/Desktop/github_proj/ejmr_analysis/ejmr_data")

# show the directory the kernel is in before switching (print() with a single
# argument behaves the same on Python 2 and Python 3)
print(os.getcwd())
# all relative file names used below are resolved against the data directory
os.chdir(DATA_DIR)

/Users/palermospenano/Desktop/github_proj/ejmr_analysis/ejmr_data


In [70]:
# natural-sort key: digit runs inside a file name are compared as integers,
# so a name ending in 10 is placed after the one ending in 9
numbers = re.compile(r'(\d+)')
def numericalSort(value):
    """
    Split `value` on runs of digits and return the pieces as a list in which
    every digit run has been converted to int and the text in between is left
    as a string. Intended as a `key=` function for sorted().
    """
    # the pattern has a capturing group, so split() keeps the digit runs and
    # they always land on the odd positions of the result
    pieces = numbers.split(value)
    return [int(piece) if position % 2 else piece
            for position, piece in enumerate(pieces)]

In [71]:
# Concatenate all per-page csv files.
# List them in natural (numeric) page order, so that ejmr_pg10.csv follows
# ejmr_pg9.csv rather than ejmr_pg1.csv.
ejmr_data = sorted(glob.glob('ejmr_pg*.csv'), key=numericalSort)
if not ejmr_data:
    # fail here with a message that names the problem; pd.concat([]) would
    # otherwise raise a ValueError that does not mention the missing files
    raise ValueError(
        "no files matching 'ejmr_pg*.csv' found in {0}".format(os.getcwd()))

# Read the files in that order. The list is already sorted: calling sorted()
# on it again without the key falls back to plain string order and undoes the
# numeric ordering established above.
df_list = []
for filename in ejmr_data:
    df_list.append(pd.read_csv(filename))
# pd.concat keeps each file's own row index, so index values repeat across files
full_df = pd.concat(df_list)

In [72]:
# The 'title' and 'freshness' columns hold raw <a href="...">text</a> markup.
# Pull the link target and the link text out of 'title', and the link text
# out of 'freshness'.
link_text_pattern = r'">(.*?)</a>'
raw_title = full_df['title']
full_df['post_link'] = raw_title.str.extract(r'<a href="(.*?)">', expand=True)
full_df['post_title'] = raw_title.str.extract(link_text_pattern, expand=True)
full_df['how_fresh'] = full_df['freshness'].str.extract(link_text_pattern, expand=True)

# Reorder: extracted, readable columns first; raw markup columns last.
column_order = [
    'post_title',
    'num_posts',
    'views',
    'votes',
    'how_fresh',
    'download_time',
    'title',
    'post_link',
    'freshness',
]
full_df = full_df[column_order]

# Write the combined frame to disk; the index is stored as 'pandas_index'.
full_df.to_csv('ejmr_data.csv', index_label='pandas_index', index=True)

In [73]:
# preview the first rows of the combined frame (rendered as the cell output)
full_df.head()

Unnamed: 0,post_title,num_posts,views,votes,how_fresh,download_time,title,post_link,freshness
0,Pls explain me how machines can learn?,5,95,0-0,1 day,2017-06-11 03:37:22.777761,"<a href=""https://www.econjobrumors.com/topic/p...",https://www.econjobrumors.com/topic/pls-explai...,"<a href=""https://www.econjobrumors.com/topic/p..."
1,Machine learning is Satan worshipping in disguise,17,337,6-0,1 day,2017-06-11 03:37:22.777852,"<a href=""https://www.econjobrumors.com/topic/m...",https://www.econjobrumors.com/topic/machine-le...,"<a href=""https://www.econjobrumors.com/topic/m..."
2,Should I buy a BMW,29,960,0-0,1 day,2017-06-11 03:37:22.777956,"<a href=""https://www.econjobrumors.com/topic/s...",https://www.econjobrumors.com/topic/should-i-b...,"<a href=""https://www.econjobrumors.com/topic/s..."
3,A proposal on how to replace student evaluations,22,561,4-3,1 day,2017-06-11 03:37:22.778058,"<a href=""https://www.econjobrumors.com/topic/a...",https://www.econjobrumors.com/topic/a-proposal...,"<a href=""https://www.econjobrumors.com/topic/a..."
4,"Roman Catholics, put down your rosaries and ex...",1,69,1-0,1 day,2017-06-11 03:37:22.778133,"<a href=""https://www.econjobrumors.com/topic/r...",https://www.econjobrumors.com/topic/roman-cath...,"<a href=""https://www.econjobrumors.com/topic/r..."


In [74]:
# reload the exported file; the index that was saved under the label
# 'pandas_index' comes back as an ordinary column of that name
ejmr_df = pd.read_csv('ejmr_data.csv')

In [75]:
def nonNaN_title_only(df, df_col):
    """
    Drop the rows of `df` whose `df_col` value is missing or an empty string.

    Empty strings in `df_col` are first replaced with NaN, then every row that
    has NaN in that column is removed. `df` is modified in place and nothing
    is returned.

    Prints the row count before cleaning, the number of NaN values in each
    column before cleaning, and the row count after cleaning.
    """
    # print() with a single argument behaves the same on Python 2 and Python 3
    print("Total number of rows: {0}".format(len(df)))
    print(len(df) - df.count())  # number of NaN rows for each column

    # Assign the result back rather than calling replace(..., inplace=True) on
    # df[df_col]: an in-place change made on a column selection is not
    # guaranteed to reach `df` (it does not under pandas copy-on-write).
    df[df_col] = df[df_col].replace('', np.nan)
    df.dropna(subset=[df_col], inplace=True)

    print("Total number of rows after deleting NaN: {0}".format(len(df)))
    
# Tokenizer and stopword set shared by every cleanText call; filled on first use.
_CLEANTEXT_RESOURCES = {}

def _cleanTextResources():
    """Return (tokenizer, stopword set), building both on the first call only."""
    if not _CLEANTEXT_RESOURCES:
        from nltk.tokenize import RegexpTokenizer
        from nltk.corpus import stopwords
        _CLEANTEXT_RESOURCES['tokenizer'] = RegexpTokenizer(r'\w+')
        # a set gives constant-time membership tests; asking the corpus reader
        # for the word list again for every single token is what made the
        # cleaning step slow
        _CLEANTEXT_RESOURCES['stopwords'] = frozenset(stopwords.words('english'))
    return _CLEANTEXT_RESOURCES['tokenizer'], _CLEANTEXT_RESOURCES['stopwords']

def cleanText(text):
    """
    Lowercase `text`, strip markup, split it into word tokens, drop English
    stopwords and return the remaining words joined by ', ' as ONE string
    (not a list), e.g. 'Should I buy a BMW' -> 'buy, bmw'.

    Punctuation disappears because only runs of word characters are kept as
    tokens. `text` must be a string; missing values have to be removed by the
    caller beforehand.

    source: http://francescopochetti.com/tag/sentiment-analysis/
    """
    from bs4 import BeautifulSoup

    tokenizer, stop_words = _cleanTextResources()

    text = text.lower()
    # NOTE(review): no parser is named here, so BeautifulSoup picks whichever
    # one is installed; left as-is to keep results identical -- confirm whether
    # a fixed parser should be pinned.
    text = BeautifulSoup(text).get_text()

    words = tokenizer.tokenize(text)
    clean = [word for word in words if word not in stop_words]

    return ', '.join(clean)

In [76]:
# drop rows without a usable post_title; the function changes ejmr_df in place
# and prints the row counts and per-column NaN counts shown below
nonNaN_title_only(ejmr_df,'post_title')

Total number of rows: 288501
pandas_index        0
post_title       2385
num_posts           0
views               0
votes               0
how_fresh           0
download_time       0
title               0
post_link        2347
freshness           0
dtype: int64
Total number of rows after deleting NaN: 286116


In [77]:
# clean text (warning: takes a while---about 5 min)
# cleanText is applied to each post_title and the resulting comma-separated
# word string is stored in the new 'cleanTitle' column
ejmr_df['cleanTitle'] = ejmr_df['post_title'].apply(cleanText)

In [78]:
# compare the original titles with their cleaned versions for the first rows
ejmr_df[['post_title', 'cleanTitle']].head()

Unnamed: 0,post_title,cleanTitle
0,Pls explain me how machines can learn?,"pls, explain, machines, learn"
1,Machine learning is Satan worshipping in disguise,"machine, learning, satan, worshipping, disguise"
2,Should I buy a BMW,"buy, bmw"
3,A proposal on how to replace student evaluations,"proposal, replace, student, evaluations"
4,"Roman Catholics, put down your rosaries and ex...","roman, catholics, put, rosaries, explain"


In [79]:
# save cleaned data to csv
# NOTE(review): ejmr_df was read from a file that already contains a
# 'pandas_index' column, so writing the index under the same label presumably
# produces two columns with that name -- confirm what downstream readers expect.
# NOTE(review): the file name is spelled 'cleand'; left unchanged because other
# code may open the file under this exact name.
ejmr_df.to_csv('ejmr_data_cleand.csv', index=True, index_label='pandas_index')