Skip to content

Latest commit

 

History

History
148 lines (111 loc) · 2.89 KB

Stemming.rst

File metadata and controls

148 lines (111 loc) · 2.89 KB

Stemming

# Mount Drive at /content/drive; the CSV paths used below live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Import pandas and load the input CSV file

import pandas as pd
# Load the input table; the code below reads its columns by position (0-9).
df=pd.read_csv('/content/drive/MyDrive/DUC 2002 E/Input Files/CSW.csv')

Create a dictionary that collects the updated (stemmed) rows of the dataset, one list per column

   # Accumulator for the output table: every output column starts as its own
   # empty list and is appended to once per processed row.
   data = {
       column: []
       for column in (
           'index',
           'file',
           'doc',
           'doc_dump',
           '1_200e',
           '1_400e',
           '2_200e',
           '2_400e',
           '3_200e',
           '3_400e',
       )
   }

Import package for stemming

import nltk
from nltk.stem import PorterStemmer
# One stemmer instance, reused for every word stemmed below.
st = PorterStemmer()

Stem each word in the data to be summarized

# Stem the text stored at column position 2 of each input row and copy the
# other nine columns through unchanged into ``data``.
#
# Changes relative to the previous version of this cell:
#   * the two copy-pasted loops (rows 0-19 and rows 20-58) are one loop;
#   * only the *trailing* punctuation mark is split off a token, so marks
#     inside the token survive ("U.S." is stemmed as "U.S" + ".", not "US" + ".");
#   * cells are read with ``df.iloc[row, col]`` instead of ``df.iloc[row][col]``,
#     which depended on the deprecated integer-position fallback of
#     ``Series.__getitem__``;
#   * the builtin ``str`` is no longer shadowed.

# Marks that are detached from the end of a token before stemming and
# re-attached afterwards.
_TRAILING_PUNCT = ('.', ',', '?', '!', ')')

# Number of leading rows of ``df`` to process (rows 0..58, as before).
_NUM_ROWS = 59

# Output key -> positional column of ``df`` it is filled from.
_COLUMN_POSITIONS = {
    'index': 0,
    'file': 1,
    'doc': 2,
    'doc_dump': 3,
    '1_200e': 4,
    '1_400e': 5,
    '2_200e': 6,
    '2_400e': 7,
    '3_200e': 8,
    '3_400e': 9,
}


def _stem_token(token, stemmer):
    """Return ``token`` stemmed, keeping a single trailing punctuation mark.

    ``token`` is a non-empty, whitespace-free string. If it ends with one of
    ``_TRAILING_PUNCT``, that last character is removed before stemming and
    appended to the stem afterwards; otherwise the whole token is stemmed.
    """
    if token.endswith(_TRAILING_PUNCT):
        # Slice off only the final character; str.replace would also delete
        # the same character anywhere else in the token.
        return stemmer.stem(token[:-1]) + token[-1]
    return stemmer.stem(token)


def _stem_text(text, stemmer):
    """Return ``text`` with every whitespace-separated token stemmed.

    Each stemmed token is followed by one space, so a non-empty result ends
    with a trailing space (kept for compatibility with earlier output files);
    text with no tokens yields the empty string.
    """
    return "".join(_stem_token(token, stemmer) + " " for token in text.split())


for i in range(_NUM_ROWS):
    for key, position in _COLUMN_POSITIONS.items():
        cell = df.iloc[i, position]
        # Only the 'doc' column is transformed; all others are copied as-is.
        data[key].append(_stem_text(cell, st) if key == 'doc' else cell)

Convert the dictionary to a DataFrame, indexed and sorted by the 'index' column

# Build the output frame from the collected columns, make 'index' the row
# index, and order the rows by it in ascending order.
d = (
    pd.DataFrame.from_dict(data)
    .set_index('index')
    .sort_index(ascending=True)
)

Write the DataFrame to a CSV file

# Write the result into the same Drive folder as the input file, as CSW_S.csv.
d.to_csv('/content/drive/MyDrive/DUC 2002 E/Input Files/CSW_S.csv')