### Importing libraries

In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords 
stop_words = set(stopwords.words('english'))
import math

### Reading files

In [2]:
df = pd.read_csv('issues.csv')
dummy = df

### Data cleaning an removing stopwords

In [3]:
for i in range(100000):
    dummy['Issue'][i] = dummy['Issue'][i].replace('/', ' ').replace(',', ' ').split()
    for j in range(len(dummy['Issue'][i])):
        dummy['Issue'][i][j] = dummy['Issue'][i][j].lower()

In [4]:
dummy

Unnamed: 0,Issue
0,"[managing, the, loan, or, lease]"
1,"[using, a, debit, or, atm, card]"
2,"[account, opening, closing, or, management]"
3,"[deposits, and, withdrawals]"
4,"[loan, servicing, payments, escrow, account]"
...,...
99995,"[cont'd, attempts, collect, debt, not, owed]"
99996,"[cont'd, attempts, collect, debt, not, owed]"
99997,"[incorrect, information, on, credit, report]"
99998,"[loan, servicing, payments, escrow, account]"


### Dividing the dataset into two interations

In [5]:
df1 = dummy[:50000]
df2 = dummy[50000:]

In [6]:
df2

Unnamed: 0,Issue
50000,"[incorrect, information, on, credit, report]"
50001,"[advertising, and, marketing]"
50002,"[loan, modification, collection, foreclosure]"
50003,"[loan, servicing, payments, escrow, account]"
50004,"[other, fee]"
...,...
99995,"[cont'd, attempts, collect, debt, not, owed]"
99996,"[cont'd, attempts, collect, debt, not, owed]"
99997,"[incorrect, information, on, credit, report]"
99998,"[loan, servicing, payments, escrow, account]"


### Storing the "Issue" items in list

In [7]:
term_lst1 = list(df1['Issue'])
term_lst2 = list(df2['Issue'])

### Creating a function to remove stopwords

In [8]:
def wordcount(total_lst):
    all_term = [i for lst in total_lst for i in lst if not i in stop_words]  
    return all_term
total_lst1 = list(set(wordcount(term_lst1)))
total_lst2 = list(set(wordcount(term_lst2)))

### Making a dataframe for word count of each term in the given corpus

In [9]:
def df_making(term,n,lst):
    initial_lst = np.zeros([50000,n], dtype = int)
    data = pd.DataFrame(initial_lst, columns=term)
    
    for i in range(50000):
        l = lst[i]
        for j in term:
            if j in l:
                data.iloc[i][j] = 1           
    return data

data1 = df_making(total_lst1,len(total_lst1),term_lst1)
data2 = df_making(total_lst2,len(total_lst2),term_lst2)

### Calculating the idf score for each term

- **idf(t) = log(N/(df + 1))**
   - idf(t) - inverse document frequency
   - N - count of corpus
   - df(t) - document frequency

In [10]:
def idf_score_calculation(terms,data):
    lst=[]
    idf_lst=[]
    
    # Calculating ocurrences of terms in the corpus
    for i in terms:
        s=0
        for j in range(50000):
            if data.iloc[j][i]==1:
                s += 1
        lst.append(s) 
        
    #  Caculating Idf score      
    N=50000
    for i in lst:
        idf_lst.append(math.log10(N / (float(i) + 1)))
    return idf_lst
    
idf_score1 = idf_score_calculation(total_lst1,data1)
idf_score2 = idf_score_calculation(total_lst2,data2)

### Calculating the dataframe for the first and second interations

In [11]:
df_inter1 = pd.DataFrame(list(zip(total_lst1, idf_score1)),columns =['Term', 'idf score - Interation1'])
df_inter2 = pd.DataFrame(list(zip(total_lst2, idf_score2)),columns =['Term', 'idf score - Interation2'])

### Merging the dataframe in the basis of "Term" column

In [12]:
final_idf_data = pd.merge(df_inter1, df_inter2, on='Term', how='outer')

In [13]:
final_idf_data

Unnamed: 0,Term,idf score - Interation1,idf score - Interation2
0,fees,2.309804,2.275724
1,missing,3.494850,3.468521
2,originator,1.710411,1.699405
3,repaying,2.333482,
4,problems,1.647432,1.580705
...,...,...,...
165,sharing,1.721704,1.786217
166,order,,4.000000
167,exchange,,4.000000
168,dispute,,4.221849


### Saving the csv file

In [14]:
final_idf_data.to_csv('idf_score_csv_file.csv')