In [1]:
import os
import sys

# adding classes folder to system path
sys.path.insert(0, os.path.abspath('..') + '/gispy')

import pandas as pd
from scipy.stats import zscore
from gist import GIS, GIST

In [2]:
df = GIST(docs_path='../data/documents', config_path='../gispy/gist_config.json').compute_scores()

loading parameters and models...
reading input text files...
------------------------------
number of documents: 50
document batch size: 10
document(s) in each batch: 5
------------------------------
processing batch #1
processing batch #2
processing batch #3
processing batch #4
processing batch #5
processing batch #6
processing batch #7
processing batch #8
processing batch #9
processing batch #10
normalizing values of indices...
computing the final GIS...
computing GIS for all documents is done. results are saved at /results.csv


In [5]:
df = pd.read_csv('results.csv')

In [6]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,d_id,text,gis,gis_zscore
0,0,m_15.txt,The purpose of the current study was to unders...,-1.245031,-2.870977
1,1,d_19.txt,Early reports have shown a relationship betwee...,-0.951089,-1.706171
2,2,d_0.txt,A third feature of our data that may constrain...,-0.57202,0.411909


In [11]:
# comparing the GIS of Discussion vs. Methods sections of a collection of documents
# Discussion sections are supposed to be more gispty and have a higher GIS value.

d_scores = []
m_scores = []

for idx, row in df.iterrows():
    if 'd_' in row['d_id']:
        d_scores.append(row['gis'])
    else:
        m_scores.append(row['gis'])

print('avg Discussion score: {}'.format(sum(d_scores) / len(d_scores)))
print('avg Methods score: {}'.format(sum(m_scores) / len(m_scores)))

avg Discussion score: -0.5528727796894336
avg Methods score: -0.6167237579259587


In [12]:
d_scores = []
m_scores = []

for idx, row in df.iterrows():
    if 'd_' in row['d_id']:
        d_scores.append(row['gis_zscore'])
    else:
        m_scores.append(row['gis_zscore'])

print('avg Discussion score: {}'.format(sum(d_scores) / len(d_scores)))
print('avg Methods score: {}'.format(sum(m_scores) / len(m_scores)))

avg Discussion score: 0.09685385714427064
avg Methods score: -0.09685385714427004


In [13]:
d_scores

[-1.7061707465939768,
 0.4119086612850305,
 3.5264479819479018,
 0.7316646097724684,
 -2.386803032251781,
 2.913292740109908,
 4.06849387608524,
 -4.388812660878576,
 0.17665493081883832,
 3.6534511922491673,
 5.62916482583163,
 -2.9421800569251717,
 -3.374868709926517,
 0.9842013171724768,
 -5.747488951362943,
 3.6511786644669484,
 -3.763035794175507,
 0.6709206238144458,
 -3.4010673323734646,
 -2.1732344284066083,
 4.53867352794812,
 -3.903499555517017,
 2.509666958354424,
 -2.0260164750314846,
 4.7688042621932105]

In [14]:
m_scores

[-2.870976717883112,
 -1.91317924882866,
 3.3232171255235112,
 4.4486314103237135,
 1.8492779462410651,
 0.2995320464668886,
 -2.2183922036721078,
 1.3677082633600033,
 2.3722494453087446,
 -1.2564124425621437,
 2.6652847311483243,
 -4.830499844862247,
 4.232723970797942,
 0.3769922586959005,
 1.9907772941321904,
 -5.3268528183489225,
 0.6196401111403966,
 -7.220587271097528,
 -2.7240379999562303,
 1.694336418057519,
 -0.6010227697213099,
 -2.0823829760332995,
 4.73090328931009,
 -1.1064737369941886,
 -0.24180270915329086]

### Computing GIS using pre-computed Coh-Metrix indexes

In [4]:
df = pd.read_csv("../data/mturk_all.csv")

In [5]:
a = GIS().score(df, wolfe=True)

In [6]:
a.head(5)

Unnamed: 0,TextID,DESPC,DESSC,DESWC,DESPL,DESPLd,DESSL,DESSLd,DESWLsy,DESWLsyd,...,WRDHYPv,WRDHYPnv,RDFRE,RDFKGL,RDL2,zSMCAUSlsa,zSMCAUSwn,zWRDIMGc,zWRDHYPnv,GIS
0,C:\gwu\cohmetrix\CohMetrix2021\coh_input_2\301...,8,14,355,1.75,1.389,25.5,14.325,1.614,0.924,...,1.842,2.275,44.553001,13.344,6.526,-0.975,-0.229166,1.242778,1.661539,-4.25215
1,C:\gwu\cohmetrix\CohMetrix2021\coh_input_2\301...,11,25,425,2.273,1.191,17.24,10.978,1.532,0.827,...,1.693,2.405,59.973,9.118,12.206,-0.675,0.8125,0.817916,2.161538,-6.053954
2,C:\gwu\cohmetrix\CohMetrix2021\coh_input_2\301...,4,17,313,4.25,2.63,18.471001,9.677,1.431,0.826,...,1.817,1.643,67.084,8.476,21.061001,-0.15,-0.208333,-1.270545,-0.769231,4.363109
3,C:\gwu\cohmetrix\CohMetrix2021\coh_input_2\301...,18,26,744,1.444,0.616,29.0,11.631,1.777,1.03,...,1.643,2.383,27.457001,16.538,8.316,-1.375,-0.604167,0.801352,2.076923,-5.268108
4,C:\gwu\cohmetrix\CohMetrix2021\coh_input_2\301...,5,16,272,3.2,1.789,17.25,9.936,1.346,0.702,...,1.467,1.603,75.708,6.923,20.931,-1.45,0.15625,-0.696967,-0.923077,1.551794


In [5]:
a.to_csv('mturk_all.csv')