## 1. Importing libraries

In [1]:
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

## 2. Getting all the documents from the directory to a dataframe

In [2]:
list_of_docs = []
for i in range(0,80):
    filepath = str('Train_docs\case_'+str(i)+'_statement.txt')
    with open(filepath) as f:
        s = f.read()
        s = re.sub(r"[\d+.\n]","",s)
        list_of_docs.append(s)

df = pd.DataFrame(list_of_docs,columns=['docs'])


## 3. Similarly, getting all the tags from the directory to the same dataframe 

In [3]:
list_of_tags = []

for j in range(0,80):
    path = str('Train_tags\case'+str(j)+'.txt')
    with open(path) as f2:
        for line in f2:
            list_of_tags.append(line)
    
df['tags'] = list_of_tags
df


Unnamed: 0,docs,tags
0,"Kurian Joseph, J Leave granted in Special Leav...","Absence, Access, Accident, Account, Acquisitio..."
1,"Abhay Manohar Sapre, J Delay in filing special...",Cause of Action
2,"Pinaki Chandra Ghose, J This criminal appeal, ...","Abetment, Abetment of Suicide, Absconding, Acc..."
3,This matter is placed before us as a Bench of...,"Decision, Exemption, Exemption Notification, I..."
4,We have heard learned Counsel for the parties...,"Child Labour, Compensation, Fundamental Right,..."
...,...,...
75,This is tenant's appeal by special leave Thou...,"Absence, Appeal, Appellate Court, Bona Fide, B..."
76,"SB Sinha, J The primal question involved in th...","Administration of Justice, Admiralty Jurisdict..."
77,"R Banumathi, J Leave granted This appeal by sp...","Absorption, Ad"
78,"Dipak Misra, J Despite completion of a decade ...","Appreciation of Evidence, Assault, Autopsy, Br..."


## 3. Splitting the tags into columns

In [4]:
for i in range(0, len(df['tags'])):
    df['tags'][i] = list(df['tags'][i].split(","))

In [5]:
type(df['tags'][7])

list

In [6]:
i=0
y = list()
for i in range(0,len(df['tags'])):
    y.append(df['tags'][i]) 

In [7]:
new_list = [item for sublist in y for item in sublist]

### Used multilabel binarizer to convert all the tags to columns.

In [8]:
mlb = MultiLabelBinarizer()
df = df.join(pd.DataFrame(mlb.fit_transform(df.pop('tags')),
                          columns=mlb.classes_,
                          index=df.index))

## 4. Used lemmatization on the documents

In [9]:
lemmatizer=WordNetLemmatizer()
for i in range(len(df['docs'])):
    words=nltk.word_tokenize(df['docs'][i])
    words=[lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
    df['docs'][i]=' '.join(words)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


## Vectorized the documents

In [10]:
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]
words = set(words)
words = list(words)

In [11]:
vect = CountVectorizer(tokenizer=tokenizer_porter, vocabulary=words)
X = vect.transform(df['docs']).toarray()


In [12]:
y = df.iloc[:,1:]

In [13]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)