In [1]:
import chardet
import json
import pandas as pd
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Fetch the NLTK resources this notebook asks for (already-present packages
# are reported as up-to-date rather than downloaded again).
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)
# Kept as the cell's final expression so its return value is still displayed.
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/meganmoore/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/meganmoore/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/meganmoore/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Sniff the encoding of the metadata file from its raw bytes before parsing it.
with open("../data/metadata_w_2020articles.json", mode='rb') as f:
    result = chardet.detect(f.read())

# Report the detection result once the file handle has been closed.
print(result)

{'encoding': 'ascii', 'confidence': 1.0, 'language': ''}


In [4]:
# Load the article metadata JSON into a DataFrame (reshaped in the next cell).
df = pd.read_json(path_or_buf='../data/metadata_w_2020articles.json')

In [5]:
# Transpose the frame so the labels that were columns become rows, then move
# those labels out of the index into a regular 'uuid' column.
# Guarded on the 'uuid' column so that re-running this cell does not transpose
# the already-reshaped frame back again.
if 'uuid' not in df.columns:
    df = df.T.reset_index().rename(columns={'index': 'uuid'})

In [6]:
# Lowercase the title and the article body.
df['title'] = df['title'].str.lower()
df['article_text'] = df['article_text'].str.lower()

# Remove newline, tab and carriage-return characters from the title.
# Series.str.replace is used instead of apply(re.sub) so that a missing (NaN)
# title passes through unchanged rather than raising a TypeError.
df['title'] = df['title'].str.replace(r'[\n\t\r]', '', regex=True)

In [7]:
# Concatenate title and article text (separated by one space) and keep only the
# first MAX_TITLE_TEXT_CHARS characters of the result.
# NOTE(review): this limit counts characters, not tokens - confirm that is what
# the downstream consumer expects.
MAX_TITLE_TEXT_CHARS = 512

# The .str slice is vectorized and leaves missing values as NaN, whereas
# apply(lambda x: x[:512]) raises on a NaN produced by a missing title or text.
df['title_text'] = (df['title'] + ' ' + df['article_text']).str[:MAX_TITLE_TEXT_CHARS]

In [8]:
# Spot-check one known article to confirm that title_text was shortened along
# the right dimension (per-row string length, not number of rows).
test_val = df.loc[df['uuid'] == 'bcbc6bb2-406e-11ee-a96e-33dec8f414a2']

# .iloc[0] picks the first matching row by position. A plain [0] is a label
# lookup and only works when the matching row happens to have index label 0.
len(test_val['title_text'].iloc[0])

512

In [9]:
# Jackie's lemmatizer
def lemmatize(text):
    """Return `text` with every token replaced by its lemma.

    The text is split with NLTK's `word_tokenize`, each token is passed through
    a `WordNetLemmatizer`, and the results are re-joined with single spaces, so
    the original spacing between tokens is not preserved.

    No part-of-speech tag is passed to the lemmatizer, so NLTK's default applies
    (presumably noun - confirm against the installed NLTK version).
    """
    wnl = WordNetLemmatizer()
    return ' '.join(wnl.lemmatize(tok) for tok in word_tokenize(text))

# TODO: Efficiency concern - we tokenize in order to lemmatize, and then re-join the tokens into a string.
# Should the embeddings be created inside the same loop instead?
# Work on a copy so the non-lemmatized frame `df` stays available.
lemmatized_df = df.copy()

# Lemmatize each text column in place on the copy, one column at a time.
for column in ('title_text', 'article_text', 'title'):
    lemmatized_df[column] = lemmatized_df[column].apply(lemmatize)


In [10]:
# Persist the lemmatized frame so later runs can load it and save time.
# index=True (the pandas default) is spelled out: the row index is written
# as the first CSV column.
lemmatized_df.to_csv(
    path_or_buf='../data/metadata_w_2020articles_lemmatized.csv',
    index=True,
)

In [11]:
# Also persist the cleaned-but-not-lemmatized frame, in case lemmatization
# turns out to distort the text.
# index=True (the pandas default) is spelled out: the row index is written
# as the first CSV column.
df.to_csv(
    path_or_buf='../data/metadata_w_2020articles_cleaned.csv',
    index=True,
)