In [1]:
import os
import sys

# adding classes folder to system path
sys.path.insert(0, os.path.abspath('..') + '/gispy')

import seaborn as sn
import pandas as pd

from scipy.stats import zscore
from gist import GIS, GIST

In [2]:
df = GIST(docs_path='../data/documents', config_path='../gispy/gist_config.json').compute_scores()

loading parameters and models...
reading input text files...
------------------------------
number of documents: 50
document batch size: 10
document(s) in each batch: 5
------------------------------
processing batch #1
processing batch #2
processing batch #3
processing batch #4
processing batch #5
processing batch #6
processing batch #7
processing batch #8
processing batch #9
processing batch #10
normalizing values of indices...
computing the final GIS...
computing GIS for all documents is done. results are saved at /results.csv


In [2]:
df = pd.read_csv('../gispy/results.csv')

In [3]:
# comparing the GIS of Discussion vs. Methods sections of a collection of documents
# Discussion sections are supposed to be more gisty and have a higher GIS value.

d_scores = []
m_scores = []

for idx, row in df.iterrows():
    if 'd_' in row['d_id']:
        d_scores.append(row['gis'])
    else:
        m_scores.append(row['gis'])

print('avg Discussion score: {}'.format(sum(d_scores) / len(d_scores)))
print('avg Methods score: {}'.format(sum(m_scores) / len(m_scores)))

avg Discussion score: -0.9624967105187693
avg Methods score: -0.7077170913286204


In [4]:
d_scores = []
m_scores = []
scores = []

for idx, row in df.iterrows():
    scores.append(row['gis_zscore'])
    if 'd_' in row['d_id']:
        d_scores.append(row['gis_zscore'])
    else:
        m_scores.append(row['gis_zscore'])

print('avg Discussion score: {}'.format(sum(d_scores) / len(d_scores)))
print('avg Methods score: {}'.format(sum(m_scores) / len(m_scores)))

avg Discussion score: -0.1763547847314468
avg Methods score: 0.1763547847314513


In [3]:
# sn.histplot(df['gis_zscore'])
# sn.histplot(df['gis_zscore'])

In [11]:
df_new = pd.DataFrame(columns=['coh_expert', 'coh_turker', 'gis', 'gis_z'])
for idx, row in df.iterrows():
    vals = row['d_id'].strip('.txt').split('_')
    df_new = df_new.append({'coh_expert': vals[2], 'coh_turker': vals[3], 'gis': row['gis'], 'gis_z': row['gis_zscore']}, ignore_index=True)

In [22]:
df_new['gis'].values

array([-0.49870413, -0.49824469, -0.35813221, ..., -0.7283791 ,
       -1.00103158, -0.93645822])

In [51]:
import numpy as np
import matplotlib.pyplot as plt  
from sklearn.linear_model import LinearRegression

x = df_new['gis_z'].values.reshape(-1, 1)
y = df_new['coh_expert'].values.reshape(-1, 1)
model = LinearRegression().fit(X, Y)
y_pred = model.predict(X)

# Obtain the coefficient of determination by calling the model with the score() function, then print the coefficient:
r_sq = model.score(x, y)
print('coefficient of determination:', r_sq)

# Print the Intercept:
print('intercept:', model.intercept_)

# Print the Slope:
print('slope:', model.coef_) 

# Predict a Response and print it:
y_pred = model.predict(x)
print('Predicted response:', y_pred, sep='\n')

coefficient of determination: -0.02178850564453283
intercept: [3.31404241]
slope: [[-0.05703147]]
Predicted response:
[[3.32483719]
 [3.34606311]
 [3.23638075]
 ...
 [3.40355987]
 [3.44644501]
 [3.44756526]]


In [46]:
type(X), type(Y)

(numpy.ndarray, numpy.ndarray)

### Computing GIS using pre-computed Coh-Metrix indexes

In [4]:
df = pd.read_csv("../data/mturk_all.csv")

In [5]:
a = GIS().score(df, wolfe=True)

In [6]:
a.head(5)

Unnamed: 0,TextID,DESPC,DESSC,DESWC,DESPL,DESPLd,DESSL,DESSLd,DESWLsy,DESWLsyd,...,WRDHYPv,WRDHYPnv,RDFRE,RDFKGL,RDL2,zSMCAUSlsa,zSMCAUSwn,zWRDIMGc,zWRDHYPnv,GIS
0,C:\gwu\cohmetrix\CohMetrix2021\coh_input_2\301...,8,14,355,1.75,1.389,25.5,14.325,1.614,0.924,...,1.842,2.275,44.553001,13.344,6.526,-0.975,-0.229166,1.242778,1.661539,-4.25215
1,C:\gwu\cohmetrix\CohMetrix2021\coh_input_2\301...,11,25,425,2.273,1.191,17.24,10.978,1.532,0.827,...,1.693,2.405,59.973,9.118,12.206,-0.675,0.8125,0.817916,2.161538,-6.053954
2,C:\gwu\cohmetrix\CohMetrix2021\coh_input_2\301...,4,17,313,4.25,2.63,18.471001,9.677,1.431,0.826,...,1.817,1.643,67.084,8.476,21.061001,-0.15,-0.208333,-1.270545,-0.769231,4.363109
3,C:\gwu\cohmetrix\CohMetrix2021\coh_input_2\301...,18,26,744,1.444,0.616,29.0,11.631,1.777,1.03,...,1.643,2.383,27.457001,16.538,8.316,-1.375,-0.604167,0.801352,2.076923,-5.268108
4,C:\gwu\cohmetrix\CohMetrix2021\coh_input_2\301...,5,16,272,3.2,1.789,17.25,9.936,1.346,0.702,...,1.467,1.603,75.708,6.923,20.931,-1.45,0.15625,-0.696967,-0.923077,1.551794


In [5]:
a.to_csv('mturk_all.csv')

In [6]:
from nltk.corpus import wordnet
cb = wordnet.synset('cookbook.n.01')
ib = wordnet.synset('instruction_book.n.01')
cb.wup_similarity(ib)

0.9166666666666666

In [7]:
a = wn.synsets('oppose', wn.VERB)
b = wn.synsets('cite', wn.VERB)

In [11]:
len(a), len(b)
#wordnet.wup_similarity(a[0], b[0])

(6, 7)