In [24]:
# Necessary libraries.
from numpy import intersect1d, nan, unique
import pandas as pd
from collections import Counter
from ast import literal_eval
from string import punctuation
import re
import os
from os.path import join, exists
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl

# Plot styling: white figure background and a larger default font size.
mpl.rcParams.update({"figure.facecolor": "white"})
plt.rcParams["font.size"] = 18

from warnings import filterwarnings

# Suppress every warning category for the rest of the session.
filterwarnings(action="ignore")

# Display every column and keep wide frames on a single, unwrapped block.
pd.set_option("display.max_columns", None)
pd.set_option("display.expand_frame_repr", False)
# `None` means "no limit" on the rendered cell width. The former `-1` sentinel
# was deprecated in pandas 1.0 and is rejected by newer releases.
pd.set_option("display.max_colwidth", None)

In [2]:
# Load the Excel workbook into `df` and show its (rows, columns) shape.
# NOTE(review): the path is absolute and machine-specific; the notebook will not
# run elsewhere without editing it — consider a configurable data directory.
filepath = r"D:\ML_projects\IPV-Project\annotation\data\overall.xlsx"
df = pd.read_excel(filepath)
df.shape

(6220, 12)

## Convert the stringified list columns into Python lists.

In [3]:
# These columns are read from the workbook as the string form of a Python list;
# parse each one back into a real list.
# Cells that are already lists are passed through unchanged so the cell can be
# re-run safely: literal_eval only accepts strings and would raise on a list.
for list_column in ("tokens", "ac", "ap"):
    df[list_column] = df[list_column].apply(
        lambda cell: cell if isinstance(cell, list) else literal_eval(cell)
    )

# Show a random handful of rows as a sanity check.
df.sample(5)

Unnamed: 0,id,source,annotator,text,tokens,ac,ap,conf,ipv,link,keyword,date
5452,5453,twitter,shr,देख्यौं नारी चरित्र ! कति निर्लज्ज र स्वार्थी हुँदा रहेछन् । उता सेटिङ मिल्यो एउटा कुरा अनि मिलेन अर्कै कुरा । दुइटै हातमा लड्डु !,"[देख्यौं, नारी, चरित्र, !, कति, निर्लज्ज, र, स्वार्थी, हुँदा, रहेछन्, ।, उता, सेटिङ, मिल्यो, एउटा, कुरा, अनि, मिलेन, अर्कै, कुरा, ।, दुइटै, हातमा, लड्डु, !]","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]",1,0,https://twitter.com/mahaabi01/status/1515694280941387777,चरित्रहीन OR चरित्र,2022-04-17 19:55:51
2205,2206,simulation_ipv,shr,तँ जस्तो चालु केटीको त मुखमा थुके पनि थुक खेर जान्छ |,"[तँ, जस्तो, चालु, केटीको, त, मुखमा, थुके, पनि, थुक, खेर, जान्छ, |]","[O, O, B-character_assasination, I-character_assasination, O, B-profanity, I-profanity, I-profanity, I-profanity, I-profanity, I-profanity, O]","[_, _, 0, 0, _, 1, 1, 1, 1, 1, 1, _]",8,1,,,
2325,2326,simulation_ipv,shr,तँ जाठी पोइ बिदेशमा हुदा यता बोका केटासंग नारिएर हिड्नेले ठुला कुरा गर्छेस |,"[तँ, जाठी, पोइ, बिदेशमा, हुदा, यता, बोका, केटासंग, नारिएर, हिड्नेले, ठुला, कुरा, गर्छेस, |]","[O, B-profanity, O, O, O, O, B-character_assasination, I-character_assasination, I-character_assasination, I-character_assasination, O, O, O, O]","[_, 0, _, _, _, _, 1, 1, 1, 1, _, _, _, _]",8,1,,,
2341,2342,simulation_ipv,shr,उसले मेरो बैंकिङ मेलिङ परिवर्तन गरि आफ्नो ठेगाना राखेर मेरो गोपनियता पुर्णतय नियन्त्रण गर्यो |,"[उसले, मेरो, बैंकिङ, मेलिङ, परिवर्तन, गरि, आफ्नो, ठेगाना, राखेर, मेरो, गोपनियता, पुर्णतय, नियन्त्रण, गर्यो, |]","[O, O, O, O, O, O, O, O, O, O, B-general_threat, I-general_threat, I-general_threat, I-general_threat, O]","[_, _, _, _, _, _, _, _, _, _, 1, 1, 1, 1, _]",6,1,,,
2546,2547,simulation_ipv,shr,तँ भुस्याहा कुकुर नै होस् ! जा मुजी जे उखाड्नु छ उखाड |,"[तँ, भुस्याहा, कुकुर, नै, होस्, !, जा, मुजी, जे, उखाड्नु, छ, उखाड, |]","[O, B-character_assasination, I-character_assasination, I-character_assasination, I-character_assasination, O, O, B-profanity, O, O, O, O, O]","[_, 1, 1, 1, 1, _, _, 0, _, _, _, _, _]",8,1,,,


In [5]:
# Number of rows per value of the `ipv` column in the loaded frame.
df["ipv"].value_counts()

0    3575
1    2645
Name: ipv, dtype: int64

## Drop duplicates, remove punctuation, and filter out short sentences.

In [25]:
# Keep only the sentence and its label for the text-classification data.
text = df[["text", "ipv"]]

## Drop duplicates.
shape_before = len(text)
print("Shape before: ", shape_before)

# Non-inplace drop followed by an explicit copy: the frame that is modified
# below is an independent object rather than a slice of `df`, so no
# chained-assignment (SettingWithCopy) situation can arise.
# NOTE(review): duplicates are detected on the raw text, i.e. before punctuation
# and whitespace are normalised below; sentences that only differ in those
# characters would survive as near-duplicates — confirm this is intended.
text = text.drop_duplicates(subset="text", keep="first").copy()

shape_after = len(text)
print("Shape after:  ", shape_after)
print("Number of duplicates removed: ", shape_before - shape_after)

## Remove punctuations.
# ASCII punctuation plus the danda, curly single quotes, en dash (U+2013)
# and bullet (U+2022).
other_punctuations = "।‘’" + "!\"#$%&'()*+,./:;<=>?@[\\]^_`{|}~" + chr(8211)
to_remove = punctuation + other_punctuations + chr(8226)
text["text"] = text["text"].str.translate(str.maketrans("", "", to_remove))

## Remove multiple whitespaces + strip whitespaces at the extremes.
text["text"] = text["text"].str.replace(r" +", " ", regex=True).str.strip()

# Select only those rows having sentence lengths greater than or equal to 3
# (length = number of whitespace-separated tokens).
condition = text["text"].str.split().str.len() >= 3
text = text[condition]

print("Data Shape: ", text.shape)
print(
    "Number of sentences removed with length less than 3: ", shape_after - text.shape[0]
)

Shape before:  6220
Shape after:   4532
Number of duplicates removed:  1688
Data Shape:  (4484, 2)
Number of sentences removed with length less than 3:  48


## Value Counts.

In [27]:
# Number of rows per value of the `ipv` column after cleaning.
text["ipv"].value_counts()

0    2747
1    1737
Name: ipv, dtype: int64

## Apply stratified k-fold cross-validation and save the folds.

### Helper Function.

In [None]:
def write_csv(df: pd.DataFrame, target_filename: str) -> None:
    """
    Write a DataFrame to a UTF-8 encoded CSV file without header or index.

    Only the cell values are written: the column names and the row index are
    both omitted from the output file.

    Parameters
    ----------
    df : DataFrame
        Source DataFrame.
    target_filename : str
        File path of the target file.

    Returns
    -------
    None.

    """
    # `header` and `index` are documented as booleans; pass False explicitly
    # instead of relying on None being treated as falsy.
    df.to_csv(target_filename, encoding="utf-8", header=False, index=False)

In [29]:
# Shuffle all rows with a fixed seed and renumber them from 0.
# NOTE: this rebinds `df`, which until here referred to the frame loaded from the workbook.
df = (
    text
    .sample(frac=1, random_state=1234)
    .reset_index(drop=True)
)

In [30]:
# Root directory under which one numbered sub-directory per fold is created.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
save_dir = r"D:\ML_projects\IPV-Project\data\text_classification"

In [26]:
# Stratified k fold.
# Split the shuffled frame into 5 folds that preserve the `ipv` label ratio and
# write each fold's train/validation parts to its own numbered directory.
skf = StratifiedKFold(n_splits=5)
target = df["ipv"].values

for k, (train_id, val_id) in enumerate(skf.split(df, target), 1):
    # split() yields positional row indices, so select by position rather than
    # by index label (the two only coincide while the index is 0..n-1).
    train = df.iloc[train_id, :]
    val = df.iloc[val_id, :]

    # Info.
    print(f"\nLength of training set : {len(train)}")
    print(f"Length of validation set : {len(val)}\n")

    # Save files.
    save_filepath = join(save_dir, str(k))
    os.makedirs(save_filepath, exist_ok=True)

    print(f"Saving files at : {save_filepath}...\n")

    write_csv(train, join(save_filepath, "train.txt"))
    write_csv(val, join(save_filepath, "val.txt"))

    # The previous `utils.current_timestamp()` call referenced a module that is
    # never imported in this notebook; use pandas' clock instead.
    print(f"Success! Save date : {pd.Timestamp.now():%Y-%m-%d %H:%M:%S}\n")

#%% Verbose.
# Re-read the saved folds and report how many rows of each label ended up in
# every train/validation file.
# The script-style `args` namespace does not exist in this notebook, so the
# switch is a plain local flag and the files are read from `save_dir`.
verbose = True

if verbose:
    # Take the fold count from the splitter so the report always matches
    # the folds that were written.
    n_folds = skf.get_n_splits()

    train_coll = {"0": [], "1": []}
    val_coll = {"0": [], "1": []}

    for k in range(1, n_folds + 1):
        save_filepath = join(save_dir, str(k))
        train_df = pd.read_csv(
            join(save_filepath, "train.txt"), header=None, encoding="utf-8"
        )
        val_df = pd.read_csv(
            join(save_filepath, "val.txt"), header=None, encoding="utf-8"
        )

        # The label is the last column of each saved file.
        train_dict = train_df.iloc[:, -1].value_counts().to_dict()
        val_dict = val_df.iloc[:, -1].value_counts().to_dict()

        # A label missing from a file is reported as 0 rather than raising KeyError.
        train_coll["0"].append(train_dict.get(0, 0))
        train_coll["1"].append(train_dict.get(1, 0))

        val_coll["0"].append(val_dict.get(0, 0))
        val_coll["1"].append(val_dict.get(1, 0))

    print(f"Size of train data across {n_folds} folds : \n", train_coll)
    print(f"Size of validation data across {n_folds} folds : \n", val_coll)

Unnamed: 0,text,ipv
5969,हैन यो के हुन थालो मधेस प्बाेस को रोग सिन्दुली तिर नि सरेछ बालबालिका छाडा छोड्ने बाउ आमा लाइ सबक सिकाउ तर यो जात्रा बन्द गर,0
6134,मान्छे जीवन देखि यति निराशावादी देखिन्छ्न कि तँ जिउदै छ्स् भनेर प्रस्न गर्ने हो भने एक पल झस्किएर उत्तर दिनेछ्न्,0
4953,कति स्पेशमा मात्रै झुन्डिएर बसेका हुन के मानिसहरु कहिलेकाही त टियल स्क्राेल पनि गर्नु नि हाम्रा ट्विट खेर गय,0
3706,आशा गर्छु तेरो परिवार तेरो अगाडी मरुन,0
3551,के हुँदैछ नेपालमा,0
5779,तिम्रो मृत्यु पश्चात चै किन त्यो कहानी लेख्नु बर्बाद जवानीको किताब तिमीले चै परेन देख्नु,0
5434,वेश्याको चरित्रलाई बुझ्न सकिन्छ तर रण्डिको चरित्र बुझ्न बहुत मुस्किल हुन्छ रबिन भाइ भन्छन छिरिङ्ग दाइ,0
904,राडी तेरो मुख लुस्नु पर्ला नि धेरै बाह्र सताइस कुरा गर्दै हिड्नी होइन,1
3446,त्यसको सामान सुपर ग्लु लगाएर टाल्दे सरकार,0
3601,जाँठा डाका हरु किन कुरा बंग्याई राखे को सिधैं भन्नु नि हौं अरु ले लुट्छ हामी ले पनि लुट्नु पाउ तिर्दैनौ के को नौटंकी धत्,0
