In [2]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
import spacy
import string
from nltk.corpus import stopwords
import math
from operator import itemgetter
from collections import Counter

import pandas as pd
import re

In [3]:
df = pd.read_json("data/wikipedia_sample.json", lines=True)
df

Unnamed: 0,id,name,wiki-title,gender,occupation,overview
0,Q4647484,A. A. Phillips,A. A. Phillips,['Q6581097'],['Q4263842'],"'''Arthur Angell Phillips''' (1900–1985), gene..."
1,Q20638732,A. F. Mujibur Rahman,A. F. Mujibur Rahman,['Q6581097'],['Q170790'],'''Abul Faiz Mujibur Rahman''' was born on Sep...
2,Q5304508,A. L. Rao,A. L. Rao,['Q6581097'],[],'''A. Lakshman Rao''' is the former [[Chief Op...
3,Q2743474,A. N. Hornby,A. N. Hornby,['Q6581097'],"['Q12299841', 'Q937857', 'Q14089670']","'''Albert Neilson Hornby''', nicknamed '''Monk..."
4,Q4648519,A. W. Rabi Bernard,A. W. Rabi Bernard,['Q6581097'],['Q82955'],'''A. W. Rabi Bernard''' is an Indian politici...
...,...,...,...,...,...,...
13912,Q341346,Česlovas Sasnauskas,Česlovas Sasnauskas,['Q6581097'],"['Q36834', 'Q765778']","'''Česlovas Sasnauskas''' (19 July 1867, in [[..."
13913,Q3023449,Deo Van Long,Đèo Văn Long,['Q6581097'],['Q82955'],'''Đèo Văn Long''' (15 March 1887 &ndash; 20 N...
13914,Q38304,Ēriks Pētersons,Ēriks Pētersons,['Q6581097'],"['Q937857', 'Q11774891']",'''Ēriks Pētersons''' (1909 in [[Riga]] – 1987...
13915,Q9394622,Łukasz Janik,Łukasz Janik,['Q6581097'],['Q11338576'],"'''Łukasz Janik''' (born 17 December 1985, in ..."


In [None]:
def clear_overview(dirty_overview):
    # remove the <ref> </ref>
    overview = re.sub('<ref.*</ref>', '', dirty_overview)
    overview = re.sub('<ref.*/>', '', overview)

    # remove {{ }} and what is inside
    overview = re.sub('[\{].*[\}]', '', overview)
    
    # reomve [[File: ]] and [[Image: ]] and what is inside
    overview = re.sub('\[\[File:.*?\]\]', '', overview)
    overview = re.sub('\[\[Image:.*?\]\]', '', overview)

    # remove [[ ]] and keep what is inside and for the cases like [[abc | def]] keep only def and remove the rest
    overview = re.sub(r'\[\[(?:[^\]|]*\|)?([^\]|]*)\]\]', r'\1', overview)

    # remove ''' ''' 
    overview = re.sub('\'{2,3}', '', overview)

    # remove \n
    overview = re.sub('\n', '', overview)
    
    return overview

## NLP

In [4]:
nlp = spacy.load('en')

In [10]:
def is_adjective(token):
    if not token.is_stop:
        if token.pos_ == 'ADJ':
            return True
        else:
            return False
    return False

In [11]:
def get_adjectives(overview):
    doc = nlp(overview)
    adjs = [token.lemma_ for token in doc if is_adjective(token)]
    return adjs

In [12]:
df['adjectives'] = ''

In [13]:
df['adjectives'] = df.overview.map(lambda x: get_adjectives(x))

In [14]:
df

Unnamed: 0,id,name,wiki-title,gender,occupation,overview,adjectives
0,Q4647484,A. A. Phillips,A. A. Phillips,['Q6581097'],['Q4263842'],"'''Arthur Angell Phillips''' (1900–1985), gene...","[cultural, cringe|cultural, pioneering, early,..."
1,Q20638732,A. F. Mujibur Rahman,A. F. Mujibur Rahman,['Q6581097'],['Q170790'],'''Abul Faiz Mujibur Rahman''' was born on Sep...,[en.banglapedia.org|access]
2,Q5304508,A. L. Rao,A. L. Rao,['Q6581097'],[],'''A. Lakshman Rao''' is the former [[Chief Op...,"[important, instrumental, compliant, top, tota..."
3,Q2743474,A. N. Hornby,A. N. Hornby,['Q6581097'],"['Q12299841', 'Q937857', 'Q14089670']","'''Albert Neilson Hornby''', nicknamed '''Monk...","[nineteenth, australian]"
4,Q4648519,A. W. Rabi Bernard,A. W. Rabi Bernard,['Q6581097'],['Q82955'],'''A. W. Rabi Bernard''' is an Indian politici...,"[indian, incumbent, party.<ref>[http://www.tn...."
...,...,...,...,...,...,...,...
13912,Q341346,Česlovas Sasnauskas,Česlovas Sasnauskas,['Q6581097'],"['Q36834', 'Q765778']","'''Česlovas Sasnauskas''' (19 July 1867, in [[...",[lithuania|lithuanian]
13913,Q3023449,Deo Van Long,Đèo Văn Long,['Q6581097'],['Q82955'],'''Đèo Văn Long''' (15 March 1887 &ndash; 20 N...,"[french, non, gagna, ensuite, name=""familysite..."
13914,Q38304,Ēriks Pētersons,Ēriks Pētersons,['Q6581097'],"['Q937857', 'Q11774891']",'''Ēriks Pētersons''' (1909 in [[Riga]] – 1987...,"[latvia]]n, international, latvia, national, n..."
13915,Q9394622,Łukasz Janik,Łukasz Janik,['Q6581097'],['Q11338576'],"'''Łukasz Janik''' (born 17 December 1985, in ...",[professional]


In [15]:
df.iloc[4].adjectives

['indian',
 'incumbent',
 'party.<ref>[http://www.tn.gov.in',
 'webarchive',
 'web/20120920112326/http://www.tn.gov.in',
 'international',
 'national',
 'national',
 'nadu',
 'rabi']

In [None]:
df[df.adjectives.str.len() > 5].adjectives

### Tokenization

In [None]:
doc = nlp(overview)

#strings are encoded to hashes
tokens = [token.text for token in doc]

tokens[:10]

### Removing stop words

In [None]:
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

In [None]:
non_stop_words_tokens = [token for token in doc if not token.is_stop]

non_stop_words_tokens[:10]

### PoS tagging

In [None]:
pos_tagged = [(token.text, token.pos_) for token in non_stop_words_tokens]

pos_tagged[:5]

In [None]:
adjectives = [token[0] for token in pos_tagged if token[1] == 'ADJ']

adjectives[:5]

In [None]:
words = [token.lemma_ for token in doc if is_adjective(token)]

In [None]:
words

In [None]:
from collections import Counter

# five most common tokens
word_freq = Counter(words)
common_words = word_freq.most_common()

print(common_words)

## DICTIONARY

In [None]:
subjectivity_lexicon = {}

In [None]:
with open("data/subjectivity_lexicon.tff", "r") as file:
    for line in file:
        elements = line.split(" ")
        word = elements[2][6:]
        subjectivity_lexicon[word] = (elements[0][5:], elements[5][14:-1])

In [None]:
subjectivity_lexicon

In [None]:
for a in df.iloc[24].adjectives:
    try:
        print(a, subjectivity_lexicon[a])
    except:
        pass

In [None]:
df.iloc[24]