## Preprocess and Combine whole data

In [1]:
import os
import numpy as np

# Define the path to the train folder
train_folder_path = './db/train/'

# Get a list of all filenames in the train folder
filenames = os.listdir(train_folder_path)

# Filter out only the files (excluding directories)
filenames = [filename for filename in filenames if os.path.isfile(
    os.path.join(train_folder_path, filename))]

# Convert the list of filenames to a NumPy array
filenames_array = np.array(filenames)

# Display the filenames array
print(filenames_array)

['Business Documents.csv' 'Creative Documents.csv'
 'Educational Documents.csv' 'Financial Documents.csv'
 'Govt Documents.csv' 'Legal Documents.csv' 'Medical Documents.csv'
 'News Documents.csv' 'Scientific Doc2.csv' 'Scientific Documents.csv'
 'Technical Documents.csv']


In [7]:
file_names_without_extension = [filename.split(
    '.')[0] for filename in filenames_array]
file_names_without_extension

['Business Documents',
 'Creative Documents',
 'Educational Documents',
 'Financial Documents',
 'Govt Documents',
 'Legal Documents',
 'Medical Documents',
 'News Documents',
 'Scientific Doc2',
 'Scientific Documents',
 'Technical Documents']

In [2]:
import pandas as pd


df = pd.read_csv('./db/train/'+filenames_array[0])
df

Unnamed: 0,content,category
0,Message-ID: <24216240.1075855687451.JavaMail.e...,
1,Message-ID: <25140503.1075855687800.JavaMail.e...,
2,Message-ID: <19034252.1075855687825.JavaMail.e...,
3,Message-ID: <719350.1075855687850.JavaMail.eva...,
4,Message-ID: <10523086.1075855687873.JavaMail.e...,
...,...,...
1011,Message-ID: <15816310.1075855374294.JavaMail.e...,
1012,Message-ID: <6521706.1075855374316.JavaMail.ev...,
1013,Message-ID: <21543395.1075855374340.JavaMail.e...,
1014,Message-ID: <25363451.1075855374674.JavaMail.e...,


In [3]:
from sklearn.model_selection import train_test_split
from creme import metrics
import creme
from creme import naive_bayes
from creme import feature_extraction
from creme import compose
import math
import pickle
from nltk.stem.porter import PorterStemmer
import string
from nltk.corpus import stopwords
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('stopwords')
# stopwords.words('english')
# string.punctuation
ps = PorterStemmer()
# ps.stem('worries')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dilshad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dilshad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def transform_text(text):
    text = text.lower()
    text = nltk.word_tokenize(text)

    y = []
    for i in text:
        if i.isalnum():
            y.append(i)

    text = y[:]
    y.clear()

    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)

    text = y[:]
    y.clear()

    for i in text:
        y.append(ps.stem(i))

    return " ".join(y)

In [33]:
def preprocesse_df(df):
    df = df.dropna()
    df.duplicated().sum()
    df = df.drop_duplicates(keep='first')
    df['transformed_content'] = df['content'].apply(transform_text)
    # Remove duplicate rows
    
    processed_df = df[df['content'].str.len() >= 1000]

    processed_df= processed_df[['transformed_content', 'category']]

    processed_df.drop_duplicates(inplace=True)
    return processed_df

In [34]:
def clean_df(df, label):
    # Remove columns with empty values
    df.dropna(axis=1, how='all', inplace=True)

    # Remove rows with NaN values
    df.dropna(axis=0, how='any', inplace=True)

    # Remove duplicate rows
    df.drop_duplicates(inplace=True)

    # Set 'category' column with the specified label
    df['category'] = label

    # Rename 'Content' column to lowercase 'content'
    df.rename(columns={'Content': 'content',
              'Category': 'category'}, inplace=True)

    return df

In [39]:
df = pd.read_csv('./db/train/'+filenames_array[4])
new_df = clean_df(df, file_names_without_extension[4] )
new_df = preprocesse_df(new_df)
new_df =pd.concat([df, new_df], ignore_index=True)
new_df

Unnamed: 0,content,category,transformed_content
0,List of Documents required as ID and Address p...,Govt Documents,
1,List of Documents required as ID and Address p...,Govt Documents,
2,Documents List of documents accepted\r\nProof ...,Govt Documents,
3,,Govt Documents,list document requir id address proof proof id...
4,,Govt Documents,document list document accept proof ident one ...


In [40]:
import pandas as pd

full_df = pd.DataFrame()  # Create an empty DataFrame

# n = len(filenames_array)
n = 3
for i in range(0, n):
    print(i, file_names_without_extension[i], 'started')
    df = pd.read_csv('./db/train/' + filenames_array[i])
    new_df = clean_df(df, file_names_without_extension[i])
    new_df = preprocesse_df(new_df)

    # Assign the result back to full_df
    full_df = pd.concat([full_df, new_df], ignore_index=True)
    print(i, file_names_without_extension[i],
          'successful ------------------------------')

0 Business Documents started
0 Business Documents successful ------------------------------
1 Creative Documents started
1 Creative Documents successful ------------------------------
2 Educational Documents started
2 Educational Documents successful ------------------------------


In [45]:
unique_categories = full_df['category'].unique()
print(unique_categories)

['Business Documents' 'Creative Documents' 'Educational Documents']


In [46]:
category_counts = full_df['category'].value_counts()
print(category_counts)

category
Creative Documents       551
Business Documents       399
Educational Documents     13
Name: count, dtype: int64


In [47]:
full_df.to_csv('full_df.csv', index=False)