In [19]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
import spacy
import string
from nltk.corpus import stopwords
import math
from operator import itemgetter
from collections import Counter

import pandas as pd
import re
import json

In [3]:
df = pd.read_json("data/wikipedia_sample.json", lines=True)
df.head()

Unnamed: 0,id,name,wiki-title,gender,occupation,overview
0,Q4647484,A. A. Phillips,A. A. Phillips,['Q6581097'],['Q4263842'],"'''Arthur Angell Phillips''' (1900–1985), gene..."
1,Q20638732,A. F. Mujibur Rahman,A. F. Mujibur Rahman,['Q6581097'],['Q170790'],'''Abul Faiz Mujibur Rahman''' was born on Sep...
2,Q5304508,A. L. Rao,A. L. Rao,['Q6581097'],[],'''A. Lakshman Rao''' is the former [[Chief Op...
3,Q2743474,A. N. Hornby,A. N. Hornby,['Q6581097'],"['Q12299841', 'Q937857', 'Q14089670']","'''Albert Neilson Hornby''', nicknamed '''Monk..."
4,Q4648519,A. W. Rabi Bernard,A. W. Rabi Bernard,['Q6581097'],['Q82955'],'''A. W. Rabi Bernard''' is an Indian politici...


In [4]:
def clear_overview(dirty_overview):
    # remove the <ref> </ref>
    overview = re.sub('<ref.*</ref>', '', dirty_overview)
    overview = re.sub('<ref.*/>', '', overview)

    # remove {{ }} and what is inside
    overview = re.sub('[\{].*[\}]', '', overview)
    
    # reomve [[File: ]] and [[Image: ]] and what is inside
    overview = re.sub('\[\[File:.*?\]\]', '', overview)
    overview = re.sub('\[\[Image:.*?\]\]', '', overview)

    # remove [[ ]] and keep what is inside and for the cases like [[abc | def]] keep only def and remove the rest
    overview = re.sub(r'\[\[(?:[^\]|]*\|)?([^\]|]*)\]\]', r'\1', overview)

    # remove ''' ''' 
    overview = re.sub('\'{2,3}', '', overview)

    # remove \n
    overview = re.sub('\n', '', overview)
    
    return overview

In [5]:
df.overview = df.overview.map(lambda x: clear_overview(x))
df.iloc[0].overview

'Arthur Angell Phillips (1900–1985), generally known as A. A. Phillips, was an Australian writer, critic and teacher, best known for coining the term "Cultural Cringe" in his pioneering essay The Cultural Cringe (1950), which set the early terms for post-colonial theory in Australia. He was educated at Melbourne Grammar School and at the Universities of Melbourne and Oxford, and later taught at Wesley College in Melbourne.The Cultural Cringe was first published in the Melbourne cultural affairs journal Meanjin. It explored ingrained feelings of inferiority that local intellectuals struggled against, and which were most clearly pronounced in the Australian theatre, music, art and letters. Phillips pointed out that the public widely assumed that anything produced by local dramatists, actors, musicians, artists and writers was necessarily deficient when compared against the works of European counterparts. The only ways local arts professionals could build themselves up in public esteem wa

## NLP

In [6]:
nlp = spacy.load('en')

In [7]:
def is_adjective(token):
    if not token.is_stop:
        if token.pos_ == 'ADJ':
            return True
        else:
            return False
    else:
        return False

In [8]:
def get_adjectives(overview):
    doc = nlp(overview)
    adjs = [token.lemma_ for token in doc if is_adjective(token)]
    return adjs

In [9]:
df['adjectives'] = ''

In [10]:
df['adjectives'] = df.overview.map(lambda x: get_adjectives(x))

In [12]:
df.head()

Unnamed: 0,id,name,wiki-title,gender,occupation,overview,adjectives
0,Q4647484,A. A. Phillips,A. A. Phillips,['Q6581097'],['Q4263842'],"Arthur Angell Phillips (1900–1985), generally ...","[australian, pioneering, early, post, -, colon..."
1,Q20638732,A. F. Mujibur Rahman,A. F. Mujibur Rahman,['Q6581097'],['Q170790'],Abul Faiz Mujibur Rahman was born on September...,[]
2,Q5304508,A. L. Rao,A. L. Rao,['Q6581097'],[],A. Lakshman Rao is the former Chief Operating ...,"[outwipro, ecil, keynote, 2nd, distinguished]"
3,Q2743474,A. N. Hornby,A. N. Hornby,['Q6581097'],"['Q12299841', 'Q937857', 'Q14089670']","Albert Neilson Hornby, nicknamed Monkey Hornby...",[nineteenth]
4,Q4648519,A. W. Rabi Bernard,A. W. Rabi Bernard,['Q6581097'],['Q82955'],A. W. Rabi Bernard is an Indian politician and...,"[indian, incumbent, international, national]"


In [34]:
df[df.adjectives.str.len() > 30]

Unnamed: 0,id,name,wiki-title,gender,occupation,overview,adjectives
745,Q4910214,Bill Moisan,Bill Moisan,['Q6581097'],['Q10871364'],"William Joseph Moisan, Jr. (July 30, 1925 – Ap...","[left, handed, right, handed, interested, majo..."
900,Q727006,Béla Imrédy,Béla Imrédy,['Q6581097'],"['Q188094', 'Q82955', 'Q193391', 'Q806798', 'Q...",Béla vitéz Imrédy de Ómoravicza (; 29 December...,"[catholic, young, skilled, fascist, ambitious,..."
1916,Q5442264,Felix Magalela Mafa Sibanda,Felix Magalela Mafa Sibanda,['Q6581097'],['Q82955'],"Felix Magalela Mafa Sibanda born February 5, 1...","[provincial, inner, great, great, personal, el..."
1973,Q5480874,Francis Fagan,Francis Fagan,['Q6581097'],['Q47064'],Francis Fagan was a captain in the United Stat...,"[9th, 3rd, second, extraordinary, japanese, st..."
1999,Q3527396,Frank Foster,Frank Foster (cricketer),['Q6581097'],['Q12299841'],Frank Rowbotham Foster (31 January 1889 – 3 Ma...,"[short, early, sufficient, fine, round, fast, ..."
2276,Q2653458,Ghillean Prance,Ghillean Prance,['Q6581097'],"['Q2374149', 'Q15839134']",Sir Ghillean Tolmie Prance (born 13 July 1937...,"[prominent, british, |, |, major, pmid, |, ghi..."
2384,Q5605615,Greg Gogan,Greg Gogan,['Q6581097'],['Q82955'],Greg Gogan is a Canadian politician and busine...,"[canadian, early, political, private, politica..."
2558,Q16934229,Helen Tenney,Helen Tenney,['Q6581072'],[],Helen Barrett Tenney worked for the Comintern ...,"[comintern, knowledgeable, foreign, late, span..."
2611,Q11708999,Henryk Jasiczek,Henryk Jasiczek,['Q6581097'],"['Q1930187', 'Q49757']",Henryk Jasiczek (2 March 1919 – 8 December 197...,"[polish, important, polish, popular, local, po..."
2868,Q4141387,Jack Goldstone,Jack Goldstone,['Q6581097'],"['Q1238570', 'Q2306091', 'Q18351723']","Jack A. Goldstone (born September 30, 1953) is...","[american, political, social, political, long,..."


### Tokenization

In [None]:
doc = nlp(overview)

#strings are encoded to hashes
tokens = [token.text for token in doc]

tokens[:10]

### Removing stop words

In [None]:
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

In [None]:
non_stop_words_tokens = [token for token in doc if not token.is_stop]

non_stop_words_tokens[:10]

### PoS tagging

In [None]:
pos_tagged = [(token.text, token.pos_) for token in non_stop_words_tokens]

pos_tagged[:5]

In [None]:
adjectives = [token[0] for token in pos_tagged if token[1] == 'ADJ']

adjectives[:5]

In [None]:
words = [token.lemma_ for token in doc if is_adjective(token)]

In [None]:
words

In [None]:
from collections import Counter

# five most common tokens
word_freq = Counter(words)
common_words = word_freq.most_common()

print(common_words)

## DICTIONARY

In [13]:
subjectivity_lexicon = {}

In [14]:
with open("data/subjectivity_lexicon.tff", "r") as file:
    for line in file:
        elements = line.split(" ")
        word = elements[2][6:]
        subjectivity_lexicon[word] = (elements[0][5:], elements[5][14:-1])

In [17]:
subjectivity_lexicon

{'abandoned': ('weaksubj', 'negative'),
 'abandonment': ('weaksubj', 'negative'),
 'abandon': ('weaksubj', 'negative'),
 'abase': ('strongsubj', 'negative'),
 'abasement': ('strongsubj', 'negative'),
 'abash': ('strongsubj', 'negative'),
 'abate': ('weaksubj', 'negative'),
 'abdicate': ('weaksubj', 'negative'),
 'aberration': ('strongsubj', 'negative'),
 'abhor': ('strongsubj', 'negative'),
 'abhorred': ('strongsubj', 'negative'),
 'abhorrence': ('strongsubj', 'negative'),
 'abhorrent': ('strongsubj', 'negative'),
 'abhorrently': ('strongsubj', 'negative'),
 'abhors': ('strongsubj', 'negative'),
 'abidance': ('strongsubj', 'positive'),
 'abide': ('strongsubj', 'positive'),
 'abject': ('strongsubj', 'negative'),
 'abjectly': ('strongsubj', 'negative'),
 'abjure': ('weaksubj', 'negative'),
 'abilities': ('weaksubj', 'positive'),
 'ability': ('weaksubj', 'positive'),
 'able': ('weaksubj', 'positive'),
 'abnormal': ('weaksubj', 'negative'),
 'abolish': ('weaksubj', 'negative'),
 'abominabl

In [20]:
with open('data/subjectivity_dictionary.json', 'w') as file:
    json.dump(subjectivity_lexicon, file)

In [35]:
for a in df.iloc[11075].adjectives:
    try:
        print(a, subjectivity_lexicon[a])
    except:
        pass

successful ('weaksubj', 'positive')
notable ('strongsubj', 'positive')
close ('weaksubj', 'negative')
famous ('weaksubj', 'positive')
active ('weaksubj', 'positive')
spiritual ('strongsubj', 'positive')
spiritual ('strongsubj', 'positive')
intimate ('strongsubj', 'neutral')
civil ('weaksubj', 'positive')
devoted ('strongsubj', 'positive')
holy ('weaksubj', 'positive')


In [36]:
df.iloc[11075]

id                                                     Q6697285
name                                               Lucille Kahn
wiki-title                                         Lucille Kahn
gender                                             ['Q6581072']
occupation                               ['Q33999', 'Q2259451']
overview      Lucille Kahn (1902–1995) was a successful Broa...
adjectives    [successful, notable, human, close, early, fam...
Name: 11075, dtype: object