In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Never truncate long cell values when a frame is rendered.
pd.options.display.max_colwidth = None

## Loading Raw Dataset

In [None]:
# Load the raw dump with one DataFrame row per line of the file, then strip
# surrounding whitespace (including the trailing newline) from every line.
with open("../datalake/raw_data.txt", "r", encoding="UTF-8") as fp:
    lines = fp.readlines()
df = pd.DataFrame({"text": lines})
df["text"] = df["text"].str.strip()
df

## Removing empty rows

In [None]:
# Drop missing values and rows whose text is empty or just a literal pair of
# double quotes. `dropna` returns a new frame rather than modifying in place,
# so its result must be assigned (a bare `frame.dropna()` call discards it).
df_without_empty_strings = df.dropna(subset=["text"])
df_without_empty_strings = df_without_empty_strings[
    ~df_without_empty_strings["text"].isin(["", '""'])
].copy()
df_without_empty_strings

In [None]:
# Number of rows the empty-row filter removed from the raw frame.
df.shape[0] - df_without_empty_strings.shape[0]

### Info about dataset without empty rows

In [None]:
# Print the column dtypes, non-null counts and memory usage of the filtered frame.
df_without_empty_strings.info()

In [None]:
# Preview the first rows of the filtered frame (head() defaults to 5 rows).
df_without_empty_strings.head()

## Removing duplicates from the data

In [None]:
# Inspect the rows whose text contains three consecutive hyphens.
df_without_empty_strings.loc[lambda frame: frame["text"].str.contains("---")]

In [None]:
# Drop exact duplicate rows, then record each text's word count
# (number of whitespace-separated tokens) in a new "length" column.
df_without_duplicates = df_without_empty_strings.drop_duplicates().copy()
df_without_duplicates["length"] = [
    len(text.strip().split()) for text in df_without_duplicates["text"]
]
df_without_duplicates

### Number of deleted duplicates

In [None]:
# Count only the rows removed by drop_duplicates(): compare against the frame
# that was its input rather than the raw `df`, which would also count the
# empty rows filtered out earlier.
len(df_without_empty_strings) - len(df_without_duplicates)

### Info about dataset without duplicates

In [None]:
# Share of the raw rows removed so far (empty-row filter and de-duplication combined).
(df.shape[0] - df_without_duplicates.shape[0]) / df.shape[0]

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric
# column(s) -- here the word counts in "length".
df_without_duplicates.describe()

In [None]:
# Index label of the maximum value in each column.
# NOTE(review): "text" holds strings; whether idxmax accepts object-dtype
# columns depends on the pandas version -- confirm, or restrict the call to
# "length" if only the longest text is of interest.
df_without_duplicates.idxmax()

In [None]:
# NOTE(review): hard-coded index label, presumably copied from the idxmax
# output of the previous cell -- it goes stale if the raw data changes; confirm.
df_without_duplicates.loc[444185]

In [None]:
# NOTE(review): hard-coded index label, presumably copied from an earlier
# cell's output -- it goes stale if the raw data changes; confirm.
df_without_duplicates.loc[65505]

### Distribution

In [None]:
# Edges of 10 equal-width bins spanning the full range of word counts.
bins = np.histogram_bin_edges(df_without_duplicates["length"].to_numpy(), bins=10)
bins

In [None]:
# Word-count histogram over the full range, using the precomputed bin edges,
# exported as SVG.
ax_with_outlier = sns.histplot(data=df_without_duplicates, x="length", bins=bins)
ax_with_outlier.get_figure().savefig(
    "histogram_with_outlier.svg", format="svg", bbox_inches="tight"
)

In [None]:
# Zoom in on texts of at most 60 words, drawn with 60 bins, exported as SVG.
ax_first_bin = sns.histplot(
    data=df_without_duplicates.loc[lambda frame: frame["length"] <= 60],
    x="length",
    bins=60,
)
ax_first_bin.get_figure().savefig(
    "histogram_first_bin.svg", format="svg", bbox_inches="tight"
)

In [None]:
# Texts of at most 7 words.
df_with_7 = df_without_duplicates.loc[df_without_duplicates["length"].le(7)]
df_with_7

In [None]:
# Short texts (<= 7 words) that mention "Dr.". The pattern is a raw string so
# that `\.` reaches the regex engine as an escaped literal dot instead of being
# an invalid Python string escape (which emits a Deprecation/SyntaxWarning).
df_with_7[df_with_7["text"].str.contains(r"Dr\.")]

In [None]:
# Texts longer than 7 words.
df_without_7 = df_without_duplicates.loc[df_without_duplicates["length"].gt(7)]
df_without_7

In [None]:
# Texts longer than 7 words that mention "Dr.". The pattern is a raw string so
# that `\.` reaches the regex engine as an escaped literal dot instead of being
# an invalid Python string escape (which emits a Deprecation/SyntaxWarning).
df_without_7[df_without_7["text"].str.contains(r"Dr\.")]

In [None]:
# Texts longer than 20 words that mention "Dr.". The pattern is a raw string
# so that `\.` reaches the regex engine as an escaped literal dot instead of
# being an invalid Python string escape (which emits a Deprecation/SyntaxWarning).
df_without_20 = df_without_duplicates[df_without_duplicates["length"] > 20]
df_without_20[df_without_20["text"].str.contains(r"Dr\.")]

In [None]:
# Word-count histogram of the texts longer than 7 words, drawn with 40 bins
# and exported as SVG.
ax_without_outlier = sns.histplot(data=df_without_7, x="length", bins=40)
ax_without_outlier.get_figure().savefig(
    "histogram_without_outlier.svg", format="svg", bbox_inches="tight"
)

In [None]:
# Summary statistics of the word counts once texts of 7 words or fewer are excluded.
df_without_7.describe()

In [None]:
# Edges of 40 equal-width bins spanning the word counts of the texts longer than 7 words.
bins_without_7 = np.histogram_bin_edges(df_without_7["length"].to_numpy(), bins=40)
bins_without_7

In [None]:
# Fraction of the raw rows that remain after all filtering steps.
df_without_7.shape[0] / df.shape[0]

In [None]:
# Export the first 700 rows of the >7-word texts together with an empty
# "labels" column (presumably to be filled in by hand -- confirm).
test_data = df_without_7.iloc[:700].assign(labels="")
test_data.to_csv("../datalake/test.csv")