#    **NLI-PT (all features)**





In [None]:
import csv
import numpy as np
import pandas as pd
from google.colab import files

* # **Preprocess output from CTAP**

*The NLI-PT corpus was divided into two batches to be processed in CTAP



In [None]:
# Load the first CTAP output batch (one row per text/feature pair)
df1 = pd.read_csv(
    "NLI-PT_V3_1_new.csv",
    encoding="unicode_escape",
)

In [None]:
# Preview the first rows of the first batch (columns: Text_Title, Feature_Name, Value)
df1.head()

Unnamed: 0,Text_Title,Feature_Name,Value,Unnamed: 3
0,fre_B_026CVATD_cop.txt,Dependency Locality Theory: Maximal IC at Fini...,1,
1,fre_B_026CVATD_cop.txt,Dependency Locality Theory: High Adjacent IC a...,0,
2,fre_B_026CVATD_cop.txt,Dependency Locality Theory: Maximal IC at Fini...,1,
3,fre_B_026CVATD_cop.txt,Dependency Locality Theory: Maximal IC at Fini...,0,
4,fre_B_026CVATD_cop.txt,Dependency Locality Theory: Total IC at Finite...,1,


In [None]:
# Load the second CTAP output batch (same layout as the first one)
df2 = pd.read_csv(
    "NLI-PT_V3_2_new.csv",
    encoding="unicode_escape",
)

In [None]:
# Preview the first rows of the second batch
df2.head()

Unnamed: 0,Text_Title,Feature_Name,Value,Unnamed: 3
0,ita_B_PA_2_22_60.2M_lei.txt,Dependency Locality Theory: Maximal IC at Fini...,0,
1,ita_B_PA_2_22_60.2M_lei.txt,Dependency Locality Theory: Total IC at Finite...,2,
2,ita_B_PA_2_22_60.2M_lei.txt,Dependency Locality Theory: Total IC at Finite...,1,
3,ita_B_PA_2_22_60.2M_lei.txt,Dependency Locality Theory: Maximal IC at Fini...,0,
4,ita_B_PA_2_22_60.2M_lei.txt,Dependency Locality Theory: Total IC at Finite...,2,


In [None]:
# Collect both batches so they can be concatenated in one call
frames = (df1, df2)

In [None]:
# Stack the two separately processed batches into a single long-format frame
df = pd.concat(objs=frames)

In [None]:
# Reshape from long to wide: one row per text, one column per feature
df = (
    df.pivot_table(
        index='Text_Title',
        columns='Feature_Name',
        values='Value',
        aggfunc='first',  # keep the first value when a text/feature pair repeats
    )
    .reset_index()
)

In [None]:
# Preview the pivoted frame: Text_Title plus one column per feature
df.head()

Feature_Name,Text_Title,Cohesive Complexity Feature: Mendes Additive Connectives per Token,Cohesive Complexity Feature: Mendes All Connectives per Token,Cohesive Complexity Feature: Mendes Causal Connectives per Token,Cohesive Complexity Feature: Mendes Concessive Connectives per Token,Cohesive Complexity Feature: Mendes Multi- to Single-Word Connectives,Cohesive Complexity Feature: Mendes Multi-Word Connectives per Connective,Cohesive Complexity Feature: Mendes Other Connectives per Token,Cohesive Complexity Feature: Mendes Single-Word Connectives per Connective,Cohesive Complexity Feature: Mendes Single-Word Connectives per Token,...,Syntactic Complexity Feature: Sentence Coordination Ratio,Syntactic Complexity Feature: T-unit complexity ratio,Syntactic Complexity Feature: Verb Cluster per Clause,Syntactic Complexity Feature: Verb Cluster per Sentence,Syntactic Complexity Feature: Verb Cluster per T-Unit,Syntactic Complexity Feature: Verb Phrases per Clause,Syntactic Complexity Feature: Verb Phrases per Sentence,Syntactic Complexity Feature: Verb Phrases per T-unit,Syntactic Complexity Feature: WH-Clefts per VP,Syntactic Complexity Feature: e-que Cleft per VP
0,ara_A_007CVITI_cop.txt,0.020408163265306,0.040816326530612,0.0,0,0,0,0,1,0.040816326530612,...,1.25,1.0,0.0,0.0,0.0,1.0,1.25,1,0,0.0
1,chi_A_012CVETD_cop.txt,0.052631578947368,0.210526315789474,0.0,0,0,0,0,1,0.210526315789474,...,1.14285714285714,1.0,0.0,0.0,0.0,1.0,1.14285714285714,1,0,0.0
2,chi_A_013CVETD_cop.txt,0.033333333333333,0.077777777777778,0.0,0,0,0,0,1,0.077777777777778,...,0.9,1.0,0.222222222222222,0.2,0.222222222222222,1.0,0.9,1,0,0.0
3,chi_A_020CAETD_1_cop.txt,0.056338028169014,0.154929577464789,0.014084507042254,0,0,0,0,1,0.154929577464789,...,0.705882352941177,1.08333333333333,0.076923076923077,0.058823529411765,0.083333333333333,0.923076923076923,0.705882352941177,1,0,0.083333333333333
4,chi_A_021CAETD_1_cop.txt,0.024096385542169,0.08433734939759,0.0,0,0,0,0,1,0.08433734939759,...,0.722222222222222,1.07692307692308,0.285714285714286,0.222222222222222,0.307692307692308,0.928571428571429,0.722222222222222,1,0,0.076923076923077


In [None]:
# Number of texts (rows) after pivoting
df.shape[0]

3068

In [None]:
# Derive the L1 label from the file name
def label_l1(row):
  """Return the integer L1 code for a row, based on its file name.

  The value of ``row['Text_Title']`` is tested against each known language
  code ('ara', 'chi', ...) with ``str.startswith``; the code of the first
  match is returned. Returns None when no language code matches.
  """
  language_dict = {'ara':1,'chi':2, 'dut':3, 'eng':4, 'fre':5, 'ger':6, 'ita':7, 'jap':8, 'kor':9, 'pol':10, 'rom':11, 'rus':12, 'spa':13, 'swe':14, 'tet':15}
  matches = (
      code
      for prefix, code in language_dict.items()
      if row['Text_Title'].startswith(prefix)
  )
  # First hit wins; None signals an unknown prefix
  return next(matches, None)

In [None]:
# Attach the L1 code of every text as a new column
df['L1'] = df.apply(label_l1, axis=1)

In [None]:
# Extract proficiency levels from file names
def proficiency(row):
  """Return the proficiency code encoded in a row's file name.

  File names look like ``ara_A_007CVITI_cop.txt``: a three-letter L1 code
  immediately followed by the level marker ``_A_``, ``_B_`` or ``_C_``.

  Args:
    row: mapping-like object (e.g. a DataFrame row) with a 'Text_Title' entry.

  Returns:
    1, 2 or 3 for levels A, B and C respectively, or None when no level
    marker follows the L1 code.
  """
  proficiency_dict = {'_A_':1, '_B_':2, '_C_':3}
  title = row['Text_Title']
  for marker, level in proficiency_dict.items():
    # The marker sits right after the 3-letter L1 code, so test at offset 3.
    # A plain startswith(marker) can never match because names begin with
    # the language code, and a substring search could hit a later '_A_'.
    if title.startswith(marker, 3):
      return level
  return None

In [None]:
# Attach the proficiency level of every text as a new column
df['Proficiency'] = df.apply(proficiency, axis=1)

In [None]:
# Preview the frame to check the newly added L1 and Proficiency columns
df.head()

Unnamed: 0,ï»¿,Text_Title,Cohesive Complexity Feature: Mendes Additive Connectives per Token,Cohesive Complexity Feature: Mendes All Connectives per Token,Cohesive Complexity Feature: Mendes Causal Connectives per Token,Cohesive Complexity Feature: Mendes Concessive Connectives per Token,Cohesive Complexity Feature: Mendes Multi- to Single-Word Connectives,Cohesive Complexity Feature: Mendes Multi-Word Connectives per Connective,Cohesive Complexity Feature: Mendes Other Connectives per Token,Cohesive Complexity Feature: Mendes Single-Word Connectives per Connective,...,Syntactic Complexity Feature: Verb Cluster per Clause,Syntactic Complexity Feature: Verb Cluster per Sentence,Syntactic Complexity Feature: Verb Cluster per T-Unit,Syntactic Complexity Feature: Verb Phrases per Clause,Syntactic Complexity Feature: Verb Phrases per Sentence,Syntactic Complexity Feature: Verb Phrases per T-unit,Syntactic Complexity Feature: WH-Clefts per VP,Syntactic Complexity Feature: e-que Cleft per VP,L1,Proficiency
0,0,ara_A_006CAETF_cop.txt,0.043165,0.086331,0.0,0.0,0,0,0,1,...,0.3,0.230769,0.3,1.0,0.769231,1.0,0,0.0,1,1
1,1,ara_A_006CAETI_cop.txt,0.073171,0.186992,0.00813,0.0,0,0,0,1,...,0.266667,0.571429,0.285714,0.933333,2.0,1.0,0,0.0,1,1
2,2,ara_A_007CVITI_cop.txt,0.020408,0.040816,0.0,0.0,0,0,0,1,...,0.0,0.0,0.0,1.0,1.25,1.0,0,0.0,1,1
3,3,ara_A_008CVETD_cop.txt,0.1,0.111111,0.0,0.0,0,0,0,1,...,0.076923,0.2,0.076923,1.0,2.6,1.0,0,0.0,1,1
4,4,ara_A_008CVETF_cop.txt,0.100529,0.148148,0.0,0.0,0,0,0,1,...,0.074074,0.2,0.076923,0.962963,2.6,1.0,0,0.0,1,1


In [None]:
# Keep only the rows that have a value in every column
df = df.dropna()

In [None]:
# Rows remaining once incomplete rows have been removed
df.shape[0]

2753

In [None]:
# Remove the stray 'ï»¿' column when it is present. The freshly pivoted frame
# has no such column (presumably it only appears after re-reading a saved CSV
# -- confirm), so ignore a missing label instead of raising KeyError.
df = df.drop(columns='ï»¿', errors='ignore')

In [None]:
# Write the cleaned frame to CSV and offer it as a browser download
output_csv = 'NLI-PT_all_features_new_noNaN.csv'
df.to_csv(output_csv, encoding='utf-8-sig')
files.download(output_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>