# Synopsis

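Convert the [General Inquirer lexicon](http://www.wjh.harvard.edu/~inquirer/spreadsheet_guide.htm) into a usable form: a CSV (`lexicons/gi.csv`) that maps each term to a sentiment of -1, 0, or 1. Before running, download [the spreadsheet](http://www.wjh.harvard.edu/~inquirer/inquirerbasic.xls) and save it as `lexicons/inquirerbasic.xls`.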

# Libraries

In [1]:
import pandas as pd
import numpy as np

# Pragmas

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Process

## Import Excel file 

The spreadsheet has 182 ontology/sentiment category columns.

In [3]:
# Load the raw General Inquirer spreadsheet into a DataFrame.
df = pd.read_excel(
    io='lexicons/inquirerbasic.xls',
    false_values=None,
)

In [4]:
# Peek at ten randomly chosen rows of the raw spreadsheet.
df.sample(n=10)

Unnamed: 0,Entry,Source,Positiv,Negativ,Pstv,Affil,Ngtv,Hostile,Strong,Power,...,Anomie,NegAff,PosAff,SureLw,If,NotLw,TimeSpc,FormLw,Othtags,Defined
4993,HERD#1,H4Lvd,,,,,,,Strong,,...,,,,,,,,,Noun,| 61% noun-adj: Animal group
1047,BELIEVE#1,H4Lvd,,,,,,,,,...,,,,,,,,,SUPV,| 68% verb: To be of the opinion that--to thin...
3889,FALL#3,H4Lvd,,,,,,,,,...,,,,,,,,,SUPV,| 4% verb-idiom: 'fall asleep'
4160,FLAWLESS,H4,Positiv,,,,,,,,...,,,,,,,,,Modif,|
8915,RUN#6,H4,,,,,,,,,...,,,,,,,,,SUPV,"| 16% verb: 'run away,' 'run off'--to flee, es..."
1913,COLLECTION,H4Lvd,,,,,,,,,...,,,,,,,,,Noun,|
6654,MISS#3,H4Lvd,,,,,,,,,...,,,,,,,,,Modif,"| 8% adj: ""Missing""--absent, lacking"
11185,VAGABOND,H4,,Negativ,,,,,,,...,,,,,,,,,Noun,|
6954,NICE#4,H4Lvd,Positiv,,Pstv,,,,,,...,,,PosAff,,,,,,LY,"| 10% adv: ""Nicely""--pleasantly, agreeably"
1177,BLOODY,H4Lvd,,Negativ,,,Ngtv,,,,...,,,,,,,,,Modif,|


## Handle variant terms 

In [5]:
# Break each Entry into pieces on runs of '#' or '_' (e.g. 'LOVE#2' -> ['LOVE', '2']).
df['split'] = df['Entry'].str.split(r'[#_]+')

In [6]:
# Function to add 1 to terms without variants
def add_1(x):
    """Give a split entry that has no variant suffix the default variant 1.

    Parameters
    ----------
    x : list
        Pieces of an ``Entry`` value after splitting, e.g. ``['LOVE', '2']``
        or ``['ABANDON']``.

    Returns
    -------
    list or int
        ``x + [1]`` (a new list) when ``x`` has exactly one element; ``x``
        unchanged when it has any other length; ``0`` -- after printing the
        offending value -- when ``x`` cannot be processed as a list
        (presumably a NaN produced for a non-string entry).

    Notes
    -----
    The ``0`` fallback is not subscriptable, so callers that index the result
    must handle it.
    """
    try:
        if len(x) == 1:
            # Concatenate into a new list so the caller's list is not mutated.
            x = x + [1]
    except TypeError:
        # TypeError is what len() raises for unsized values and what `+`
        # raises for non-list sequences. Anything else (KeyboardInterrupt,
        # real bugs) is allowed to propagate instead of being swallowed.
        print(x)
        x = 0
    return x

In [7]:
# Make sure every split entry has a variant slot, then promote the term and
# its variant to a two-level index.
df['split'] = df['split'].apply(add_1)
# Series.apply has no axis argument; a second positional value would be taken
# as `convert_dtype`, so the lambdas are passed on their own.
df['term_str'] = df['split'].apply(lambda parts: parts[0].lower())
# NOTE(review): add_1 appends the integer 1 while pieces produced by
# str.split are strings, so term_var presumably mixes int and str values --
# confirm before relying on the dtype of this index level.
df['term_var'] = df['split'].apply(lambda parts: parts[1])
df = df.set_index(['term_str', 'term_var'])

In [8]:
# Show the first five rows under the new (term_str, term_var) index.
df.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Entry,Source,Positiv,Negativ,Pstv,Affil,Ngtv,Hostile,Strong,Power,...,NegAff,PosAff,SureLw,If,NotLw,TimeSpc,FormLw,Othtags,Defined,split
term_str,term_var,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
a,1,A,H4Lvd,,,,,,,,,...,,,,,,,,DET ART,| article: Indefinite singular article--some o...,"[A, 1]"
abandon,1,ABANDON,H4Lvd,,Negativ,,,Ngtv,,,,...,,,,,,,,SUPV,|,"[ABANDON, 1]"
abandonment,1,ABANDONMENT,H4,,Negativ,,,,,,,...,,,,,,,,Noun,|,"[ABANDONMENT, 1]"
abate,1,ABATE,H4Lvd,,Negativ,,,,,,,...,,,,,,,,SUPV,|,"[ABATE, 1]"
abatement,1,ABATEMENT,Lvd,,,,,,,,,...,,,,,,,,Noun,,"[ABATEMENT, 1]"


## Convert column values to numbers

In [9]:
# Derive a single 'polarity' column from the Positiv/Negativ tag columns:
# 1 where Positiv holds the string 'Positiv', -1 where Negativ holds the
# string 'Negativ', and 0 for every other row. The Negativ assignment runs
# second, so a row carrying both tags ends up as -1.
try:
    df.loc[df.Positiv == 'Positiv', 'polarity'] = 1
    df.loc[df.Negativ == 'Negativ', 'polarity'] = -1
    # Rows matched by neither mask are still NaN here; make them neutral.
    df['polarity'] = df['polarity'].fillna(0)
except TypeError as e:
    # NOTE(review): presumably reached when the tag columns no longer hold
    # strings because a later cell already converted them -- confirm that the
    # pandas version in use actually raises TypeError in that situation.
    print(e, "--  This means you are re-running the script with values that have already been set.")

  res = shell.run_cell(code, store_history=store_history, silent=silent)


In [10]:
# Spot-check ten random polarity values.
df['polarity'].sample(n=10)

term_str    term_var
triumphant  1           1.0
resistance  1           0.0
tribe       1           0.0
obstinate   1          -1.0
compass     1           0.0
trace       1           0.0
accession   1           1.0
smitten     1           1.0
convey      1           0.0
clerk       1           0.0
Name: polarity, dtype: float64

In [11]:
# Category (tag) columns are everything except the bookkeeping columns.
# Excluding those by name instead of slicing by position keeps VALCOLS
# correct even if a helper column such as 'polarity' has not been added yet.
VALCOLS = df.columns.drop(
    ['Entry', 'Source', 'Othtags', 'Defined', 'split', 'polarity'],
    errors='ignore',
)

In [12]:
# Turn every category column into a 0/1 indicator: missing values become 0,
# any truthy value (such as a tag string) becomes 1.
for col in VALCOLS:
    is_tagged = df[col].fillna(0).astype('bool')
    df[col] = is_tagged.astype('int')

In [13]:
# List the category flags of every variant of 'love', set flags first.
(
    df.loc['love', VALCOLS]
    .stack()
    .sort_values(ascending=False)
)

term_var         
1         Positiv    1
5         Active     1
2         AffOth     1
1         AffTot     1
          AffGain    1
5         SocRel     1
3         AffGain    1
          AffTot     1
1         SV         1
5         Strong     1
4         AffGain    1
5         Affil      1
4         Positiv    1
          Pstv       1
          Affil      1
          Passive    1
          Pleasur    1
          EMOT       1
2         Positiv    1
          Pstv       1
          Affil      1
          Passive    1
6         SocRel     1
          Affil      1
3         Positiv    1
          Pstv       1
          Affil      1
          Passive    1
          Pleasur    1
          EMOT       1
                    ..
5         RcGain     0
          RcEnds     0
          Self       0
          RcTot      0
          RspGain    0
          RspLoss    0
          RspOth     0
          RspTot     0
          AffLoss    0
          AffPt      0
          PowPt      0
          PowAuP

In [14]:
# # Handle terms with multiple entries
# VAR_COUNTS = df.reset_index().groupby(['term_str']).term_var.count()\
#     .to_frame().rename(columns={'term_var':'n_vars'})
# SINGLES = VAR_COUNTS == 1
# SINGLES.head()
# df.loc['a'].T
# df.loc[SINGLES.n_vars.values]

## Take and combine only sentiment 

In [15]:
# Keep only rows flagged Positiv and/or Negativ; copy so that later column
# assignments do not write into a view of df.
sents = (
    df
    .query("Positiv > 0 | Negativ > 0")
    .copy()
)

In [16]:
# Signed sentiment per row: Positiv flag minus Negativ flag.
sents['sentiment'] = sents['Positiv'].sub(sents['Negativ'])

In [17]:
# Spot-check five random sentiment values.
sents.sentiment.sample(n=5)

term_str    term_var
confidence  1           1
inane       1          -1
fine        1           1
symbolize   1           1
value       3           1
Name: sentiment, dtype: int64

### Take average of variants

In [18]:
# Collapse the variants of each term into one value: their mean sentiment.
GI = sents.groupby('term_str')['sentiment'].mean()

### Snap values to -1, 0, or 1

In [19]:
# Reduce each averaged sentiment to its sign (-1, 0 or 1) stored as an int.
GI = np.sign(GI).astype('int')
# On the first run GI is a Series and is promoted to a one-column frame; if
# the cell is run again GI is already a DataFrame, which has no .to_frame(),
# so the conversion is skipped and the cell stays re-runnable.
if isinstance(GI, pd.Series):
    GI = GI.to_frame()

In [20]:
# (rows, columns) of the lexicon frame: one row per term_str.
GI.shape

(3626, 1)

In [21]:
# Show the first five terms of the finished lexicon.
GI.head(n=5)

Unnamed: 0_level_0,sentiment
term_str,Unnamed: 1_level_1
abandon,-1
abandonment,-1
abate,-1
abdicate,-1
abhor,-1


# Save

In [22]:
# Write the lexicon (term_str index plus sentiment column) to CSV.
GI.to_csv(path_or_buf='lexicons/gi.csv')