In [2]:
import json
import re

import nltk
import numpy as np
import pandas as pd
import spacy
from nltk.stem import PorterStemmer
# from nltk.corpus import stopwords
# from nltk.stem.snowball import SnowballStemmer
# from nltk.stem import WordNetLemmatizer 

In [5]:
# Custom stop-word list: one entry per line of the local "stop_words" file
# (used here in place of nltk's stopwords.words('english')).
with open("stop_words") as stop_words_file:
    stop_words_list = list(map(str.strip, stop_words_file))

# spaCy 'en' pipeline with the parser and NER components disabled; the
# notebook only uses it to obtain token lemmas.
spacy_lemmatizer = spacy.load('en', disable=['parser', 'ner'])

# Per-company input files; the same file names are read from each data
# source directory by the processing cells below.
file_name_list = [
    "amazon.csv",
    "apple.csv",
    "google.csv",
    "facebook.csv",
    "uber.csv",
]

## Process Common Crawl

In [7]:
# Clean every Common Crawl document and write one processed document per
# output line under processed_data/cc/.

# Matches @mentions, any character that is not an ASCII letter, space or tab,
# and URLs. Written as a raw string so that \w, \/ and \S reach the regex
# engine without relying on Python's deprecated invalid-escape pass-through.
noise_pattern = re.compile(r"(@[A-Za-z0-9]+)|([^A-Za-z \t])|(\w+:\/\/\S+)")

# Set lookup instead of scanning the stop-word list once per token.
stop_words = set(stop_words_list)

for file_name in file_name_list:
    data = pd.read_csv('data_files/cc/' + file_name, names=['content'], header=None)
    print(data.shape)
    with open('processed_data/cc/' + file_name, 'w') as f:
        # Only the text column is needed, so iterate it directly.
        for rowdata in data["content"]:
            if not isinstance(rowdata, str):
                # Empty cells are read as NaN (a float) and have no .lower().
                continue
            file_clear = noise_pattern.sub(" ", rowdata.lower())
            file_lem = " ".join(token.lemma_ for token in spacy_lemmatizer(file_clear))
            tokens = nltk.word_tokenize(file_lem)
            filtered_words = [word for word in tokens if word not in stop_words]
            # Output format is unchanged: each token followed by a space,
            # then "\n " as the record terminator.
            # NOTE(review): the space after the newline looks unintentional;
            # confirm with the downstream consumer before changing it.
            f.write("".join("%s " % item for item in filtered_words))
            f.write("\n ")
    # The with-block closes the file; no explicit f.close() is needed.

(120, 1)
(120, 1)
(120, 1)
(120, 1)
(120, 1)


## Process Tweets

In [43]:
# Deduplicate the tweets of each company, draw a sample, clean the text and
# write one processed tweet per output line under processed_data/twitter/.
sample_size = 30000
# Fixed seed so that re-running the cell selects the same tweets.
sample_seed = 42

# Matches @mentions, any character that is not an ASCII letter, space or tab,
# and URLs. Written as a raw string so that \w, \/ and \S reach the regex
# engine without relying on Python's deprecated invalid-escape pass-through.
noise_pattern = re.compile(r"(@[A-Za-z0-9]+)|([^A-Za-z \t])|(\w+:\/\/\S+)")

# Set lookup instead of scanning the stop-word list once per token.
stop_words = set(stop_words_list)

for file_name in file_name_list:
    data = pd.read_csv('data_files/twitter/' + file_name)
    data['status_id'] = data['status_id'].str.strip()
    # Keep the last occurrence of every status_id (no in-place mutation).
    data = data.drop_duplicates(subset='status_id', keep='last')
    print(data.shape)
    # DataFrame.sample raises ValueError when n exceeds the number of rows,
    # so cap the request at what is available.
    dt = data.sample(n=min(sample_size, len(data)), random_state=sample_seed)
    print(dt.shape)
    print('----------')

    with open('processed_data/twitter/' + file_name, 'w') as f:
        # Only the text column is needed, so iterate it directly.
        for rowdata in dt["text"]:
            if not isinstance(rowdata, str):
                # Missing text is read as NaN (a float) and has no .lower().
                continue
            file_clear = noise_pattern.sub(" ", rowdata.lower())
            file_lem = " ".join(token.lemma_ for token in spacy_lemmatizer(file_clear))
            tokens = nltk.word_tokenize(file_lem)
            filtered_words = [word for word in tokens if word not in stop_words]
            # Output format is unchanged: each token followed by a space,
            # then "\n " as the record terminator.
            # NOTE(review): the space after the newline looks unintentional;
            # confirm with the downstream consumer before changing it.
            f.write("".join("%s " % item for item in filtered_words))
            f.write("\n ")
    # The with-block closes the file; no explicit f.close() is needed.
    



(35610, 88)
(30000, 88)
----------
(32804, 88)
(30000, 88)
----------
(35444, 88)
(30000, 88)
----------
(35997, 88)
(30000, 88)
----------
(32273, 88)
(30000, 88)
----------


## Process New York Times data

In [42]:
# Deduplicate the articles of each company, drop incomplete rows, draw a
# sample, clean the text and write one processed article per output line
# under processed_data/nyt/.
sample_size = 300
# Fixed seed so that re-running the cell selects the same articles.
sample_seed = 42

# Matches @mentions, any character that is not an ASCII letter, space or tab,
# and URLs. Written as a raw string so that \w, \/ and \S reach the regex
# engine without relying on Python's deprecated invalid-escape pass-through.
noise_pattern = re.compile(r"(@[A-Za-z0-9]+)|([^A-Za-z \t])|(\w+:\/\/\S+)")

# Set lookup instead of scanning the stop-word list once per token.
stop_words = set(stop_words_list)

for file_name in file_name_list:
    article_df = pd.read_csv('data_files/nyt/' + file_name)
    article_df['id'] = article_df['id'].str.strip()
    # Keep the last occurrence of every id, then drop rows with any missing
    # value, so every remaining "content" cell is populated.
    article_df = article_df.drop_duplicates(subset='id', keep='last').dropna()
    print(article_df.shape)
    # DataFrame.sample raises ValueError when n exceeds the number of rows,
    # so cap the request at what is available.
    dt = article_df.sample(n=min(sample_size, len(article_df)), random_state=sample_seed)
    print(dt.shape)
    print('----------')

    with open('processed_data/nyt/' + file_name, 'w') as f:
        # Only the text column is needed, so iterate it directly.
        for rowdata in dt["content"]:
            file_clear = noise_pattern.sub(" ", rowdata.lower())
            file_lem = " ".join(token.lemma_ for token in spacy_lemmatizer(file_clear))
            tokens = nltk.word_tokenize(file_lem)
            filtered_words = [word for word in tokens if word not in stop_words]
            # Output format is unchanged: each token followed by a space,
            # then "\n " as the record terminator.
            # NOTE(review): the space after the newline looks unintentional;
            # confirm with the downstream consumer before changing it.
            f.write("".join("%s " % item for item in filtered_words))
            f.write("\n ")
    # The with-block closes the file; no explicit f.close() is needed.
    

(950, 3)
(300, 3)
(457, 3)
(300, 3)
(1005, 3)
(300, 3)
(2104, 3)
(300, 3)
(307, 3)
(300, 3)


## Sort and get data

In [9]:
# Company whose co-occurrence output is inspected in the next cell.
file_name = "amazon"

# Tab-separated, header-less file: the pair goes into "word", its frequency
# into "count". (An earlier variant of this cell read 'mr_output/nyt/' with
# the same column names.)
co_occurrence_path = 'mr_co_occ/cc/' + file_name
data = pd.read_csv(
    co_occurrence_path,
    names=['word', 'count'],
    header=None,
    sep='\t',
)

In [10]:
# Rank the pairs by frequency (highest first) and display the first 20.
pairs_by_count = data.sort_values(by=['count'], ascending=False)
pairs_by_count.head(20)

Unnamed: 0,word,count
3,amazon-company,106
35,company-amazon,106
62,time-amazon,77
6,amazon-time,77
66,time-company,67
41,company-time,67
0,amazon-business,59
9,business-amazon,59
12,business-company,56
36,company-business,56


## Co-occurrence JSON generator

In [32]:
# Keywords that become the nodes of the co-occurrence graph.
top_keywords = ["company", "amazon", "time","york","people","trump","city","change","president","business"]

# Map every keyword to its position in top_keywords; the graph builder uses
# that position as the node index. enumerate replaces the manual counter, so
# no stray `count` variable is left in the notebook namespace.
keyword_index = {keyword: position for position, keyword in enumerate(top_keywords)}

In [33]:
# Display the keyword -> node index mapping built in the previous cell.
keyword_index

{'company': 0,
 'amazon': 1,
 'time': 2,
 'york': 3,
 'people': 4,
 'trump': 5,
 'city': 6,
 'change': 7,
 'president': 8,
 'business': 9}

In [43]:
# Build the graph dictionary ({'links': [...], 'nodes': [...]}) from the
# Twitter co-occurrence output of one company.
file_name = "amazon"

# Tab-separated, header-less file; here the first column is loaded as
# "count" and the second as "word" (a "keyword-keyword" pair).
data = pd.read_csv('mr_co_occ/twitter/'+file_name, sep='\t',header=None, names=['count','word'])

# One link per pair. Columns are iterated directly instead of using iterrows,
# and each pair is split only once. tolist() yields plain Python numbers so
# the values stay JSON-serializable.
links = []
for pair, pair_count in zip(data['word'], data['count'].tolist()):
    parts = pair.split('-')
    links.append({
        # A keyword missing from keyword_index raises KeyError, as before.
        'source': keyword_index[parts[0]],
        'target': keyword_index[parts[1]],
        'value': pair_count,
    })

# One node per keyword, carrying its index and name.
nodes = [
    {'group': 'humanas', 'index': position, 'name': keyword}
    for keyword, position in keyword_index.items()
]

json_dic = {'links': links, 'nodes': nodes}

In [45]:
# Serialize the graph dictionary (links + nodes) to a JSON string and display
# it. `json` is imported in the notebook's import cell at the top.
json_data = json.dumps(json_dic)
json_data

'{"links": [{"source": 2, "target": 1, "value": 363}, {"source": 1, "target": 2, "value": 363}, {"source": 0, "target": 1, "value": 218}, {"source": 1, "target": 0, "value": 218}, {"source": 4, "target": 1, "value": 157}, {"source": 1, "target": 4, "value": 157}, {"source": 9, "target": 1, "value": 138}, {"source": 1, "target": 9, "value": 138}, {"source": 5, "target": 1, "value": 107}, {"source": 1, "target": 5, "value": 107}, {"source": 7, "target": 1, "value": 70}, {"source": 1, "target": 7, "value": 70}, {"source": 6, "target": 1, "value": 63}, {"source": 1, "target": 6, "value": 63}, {"source": 2, "target": 4, "value": 45}, {"source": 4, "target": 2, "value": 45}, {"source": 3, "target": 1, "value": 41}, {"source": 1, "target": 3, "value": 41}, {"source": 8, "target": 1, "value": 29}, {"source": 1, "target": 8, "value": 29}, {"source": 5, "target": 0, "value": 28}, {"source": 0, "target": 5, "value": 28}, {"source": 4, "target": 0, "value": 25}, {"source": 0, "target": 4, "value":