In [None]:
import math
import re
import string
from collections import Counter
from operator import itemgetter

import nltk
import pandas as pd
import spacy
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from spacy.lang.en.stop_words import STOP_WORDS

In [None]:
# Load the sample of Wikipedia overviews; `lines=True` reads one JSON record per line.
df = pd.read_json("data/overview_wikipedia_sample.json", lines=True)
df

In [None]:
def clear_overview(dirty_overview):
    """Strip common MediaWiki markup from an overview and return plain text.

    Removed: ``<ref>`` footnotes (self-closing and paired), brace-delimited
    templates (``{{...}}``, including nested and multi-line ones),
    ``[[File:...]]`` / ``[[Image:...]]`` embeds, runs of 2-3 single quotes
    (bold/italic markers) and newline characters.
    Wiki links are unwrapped: ``[[target]]`` becomes ``target`` and
    ``[[target|label]]`` becomes ``label``.

    Args:
        dirty_overview (str): raw wikitext of one overview.

    Returns:
        str: the overview with the markup listed above removed.
    """
    # Self-closing refs go first, so the paired pattern below can never start at a
    # `<ref .../>` and run on to the closing tag of a later, unrelated footnote.
    overview = re.sub(r'<ref[^>]*/>', '', dirty_overview)

    # Non-greedy: each footnote is removed on its own, so the text between two
    # footnotes survives. DOTALL lets a footnote span several lines.
    overview = re.sub(r'<ref[^>]*>.*?</ref>', '', overview, flags=re.DOTALL)

    # Remove { } groups from the inside out. Repeating until nothing matches handles
    # nested templates without deleting the text that sits between two templates.
    brace_group = re.compile(r'\{[^{}]*\}')
    removed = 1
    while removed:
        overview, removed = brace_group.subn('', overview)

    # Remove [[File: ]] and [[Image: ]] embeds together with their content.
    overview = re.sub(r'\[\[(?:File|Image):.*?\]\]', '', overview)

    # Unwrap [[ ]] links; for [[abc|def]] keep only the label `def`.
    overview = re.sub(r'\[\[(?:[^\]|]*\|)?([^\]|]*)\]\]', r'\1', overview)

    # Drop the '' / ''' italic and bold markers.
    overview = re.sub(r"'{2,3}", '', overview)

    # Remove newline characters (lines are joined without a separator).
    overview = overview.replace('\n', '')

    return overview

In [None]:
# Clean every overview in place, then show the first cleaned text as a spot check.
df['overview'] = df['overview'].map(clear_overview)
df['overview'].iloc[0]

## NLP

In [None]:
# Load spaCy's English pipeline once; the NLP cells below reuse `nlp`.
# The 'en' shortcut link only exists in spaCy 2.x. spaCy 3 removed shortcut links
# and raises OSError for it, so fall back to the explicit package name.
# NOTE(review): the fallback assumes `en_core_web_sm` is installed — confirm.
try:
    nlp = spacy.load('en')
except OSError:
    nlp = spacy.load('en_core_web_sm')

In [None]:
def is_adjective(token):
    """Return True when `token` is tagged as an adjective and is not a stop word.

    Args:
        token: object exposing `is_stop` and `pos_` (a spaCy token in this notebook).

    Returns:
        bool: True only if `token.is_stop` is falsy and `token.pos_ == 'ADJ'`.
    """
    # Stop words are rejected first; their part of speech is never inspected.
    if token.is_stop:
        return False
    return token.pos_ == 'ADJ'

In [None]:
def get_adjectives(overview):
    """Return the lemmas of the adjectives found in `overview`.

    The text is run through the module-level `nlp` pipeline and every token
    accepted by `is_adjective` contributes its `lemma_`, in document order.

    Args:
        overview (str): text to analyse.

    Returns:
        list: one lemma per matching token (duplicates are kept).
    """
    lemmas = []
    for tok in nlp(overview):
        if is_adjective(tok):
            lemmas.append(tok.lemma_)
    return lemmas

In [None]:
# Adjective lemmas of each overview, one list per row. The column is created
# directly from the mapped result, so no placeholder initialisation is needed.
df['adjectives'] = df.overview.map(get_adjectives)

In [None]:
# Rows whose adjective list holds more than five entries.
df.loc[df['adjectives'].str.len() > 5, 'adjectives']

### Tokenization

In [None]:
# Walk through the pipeline on one example text. `overview` must be defined here,
# otherwise this cell fails with NameError on a fresh kernel.
# NOTE(review): the first cleaned overview is used as the example — pick another row if intended.
overview = df.iloc[0].overview
doc = nlp(overview)

# spaCy keeps strings as hashes internally; `.text` gives back the readable string.
tokens = [token.text for token in doc]

tokens[:10]

### Removing stop words

In [None]:
# English stop-word set shipped with spaCy. It comes from the explicit top-of-file
# import rather than attribute access on `spacy.lang.en`, which only resolves if
# that submodule happens to have been imported already.
spacy_stopwords = STOP_WORDS

In [None]:
# Keep only the tokens of `doc` that are not flagged as stop words.
non_stop_words_tokens = [tok for tok in doc if not tok.is_stop]

non_stop_words_tokens[:10]

### PoS tagging

In [None]:
# Pair each remaining token's text with its coarse part-of-speech tag.
pos_tagged = [(tok.text, tok.pos_) for tok in non_stop_words_tokens]

pos_tagged[:5]

In [None]:
# Texts of the (text, tag) pairs whose tag is 'ADJ'.
adjectives = [text for text, tag in pos_tagged if tag == 'ADJ']

adjectives[:5]

In [None]:
# Lemmas of the tokens in `doc` accepted by `is_adjective`.
words = [adj.lemma_ for adj in filter(is_adjective, doc)]

In [None]:
# Inspect the adjective lemmas collected from the example document.
words

In [None]:
# Frequency table of the adjective lemmas; keep only the five most common ones.
# (`Counter` is already imported in the first cell.)
word_freq = Counter(words)
common_words = word_freq.most_common(5)

print(common_words)

## DICTIONARY

In [None]:
# Empty mapping; the loading cell fills it with one entry per lexicon word.
subjectivity_lexicon = dict()

In [None]:
# Fill `subjectivity_lexicon` with word -> (type, prior polarity).
# Every line is read as whitespace-separated `key=value` fields and looked up by key
# (`word1`, `type`, `priorpolarity`) instead of by position. Stray tokens, blank lines
# or a missing trailing newline therefore cannot shift or truncate the values.
# NOTE(review): assumes the MPQA subjectivity-clues field names — confirm against the file.
with open("data/subjectivity_lexicon.tff", "r") as file:
    for line in file:
        fields = dict(item.split("=", 1) for item in line.split() if "=" in item)
        word = fields.get("word1")
        if not word:
            # Blank or malformed line without a word: nothing to record.
            continue
        # A word listed more than once keeps its last entry.
        subjectivity_lexicon[word] = (fields.get("type", ""), fields.get("priorpolarity", ""))

In [None]:
# Peek at the first ten entries instead of rendering the whole lexicon.
dict(list(subjectivity_lexicon.items())[:10])

In [None]:
# Print the lexicon entry of each adjective of row 24 that the lexicon knows.
# A membership test replaces the bare `except: pass`, so unknown adjectives are
# still skipped but unrelated errors are no longer silently swallowed.
for adjective in df.iloc[24].adjectives:
    if adjective in subjectivity_lexicon:
        print(adjective, subjectivity_lexicon[adjective])

In [None]:
# Show the full record at position 24.
df.iloc[24]