In [None]:
# Load the raw CSV export into a DataFrame and preview its first rows
import pandas as pd

# The path is relative, so the file must be in the kernel's working directory
df = pd.read_csv("enterpret.csv")
df.head()

In [None]:
# Show the column index and the (rows, columns) shape of the loaded frame
print(
    f"Columns: {df.columns}",
    f"Rows and Columns: {df.shape}",
    sep="\n",
)

In [None]:
# Find the columns in which every value is null; they carry no information and can be dropped
all_null_mask = df.isnull().all(axis=0).to_numpy()
empty_cols = df.columns[all_null_mask].tolist()
print(f"Empty columns: {empty_cols} \nNo. of empty columns: {len(empty_cols)} ")

In [None]:
# Drop the columns in which every row is null.
# Rebinding `df` (instead of inplace=True) leaves the originally loaded frame
# object unmodified and keeps the step chainable.
df = df.dropna(axis="columns", how="all")

In [None]:
# Re-check the shape; compare it with the shape printed earlier to see how many columns were dropped
df.shape

In [None]:
# Distinct values (and how many there are) for each column that looks categorical
unique_sources, unique_types, unique_languages, unique_sentiments = (
    df[column_name].unique()
    for column_name in ("Source", "Type", "Language", "Record Sentiment")
)

for label, distinct_values in (
    ("Unique sources:", unique_sources),
    ("Unique types:", unique_types),
    ("Unique languages:", unique_languages),
    ("Unique sentiments:", unique_sentiments),
):
    print(label, distinct_values, len(distinct_values))

In [None]:
# Most frequent values per column: top 4 sources, top 3 types, top 3 sentiments
# (value_counts() sorts by descending frequency, so a positional slice keeps the top entries)
source_count = df["Source"].value_counts().iloc[:4]
type_count = df["Type"].value_counts().iloc[:3]
sentiment_count = df["Record Sentiment"].value_counts().iloc[:3]

In [None]:
# Print the three frequency tables, separated by a blank line
print(source_count, type_count, sentiment_count, sep="\n\n")

In [None]:
# Count the rows that have no value in the "Reasons" column.
# Stored under its own name so it is not confused with (or overwritten by)
# the null count of the "Content" column computed in a later cell.
null_reasons_count = df["Reasons"].isnull().sum()
print(f"Number of null rows in Reasons column: {null_reasons_count}")

In [None]:
# Look closer at the "Content" column: how many rows hold the <AUDIO_CONTENT>
# placeholder and how many rows are null
audio_content_count = (df["Content"] == "<AUDIO_CONTENT>").sum()
null_content_count = df["Content"].isnull().sum()
# Both figures count rows, so the messages say "rows" (they previously said "columns")
print(f"Number of rows where Content column is <AUDIO_CONTENT>: {audio_content_count}")
print(f"Number of rows where Content column is Null: {null_content_count}")


In [None]:
# Keep the rows that have a value in "Reasons" / "Summary" and count each subset
has_reasons = df["Reasons"].notna()
has_summary = df["Summary"].notna()

df_reasons_not_empty = df.loc[has_reasons]
df_summary_not_empty = df.loc[has_summary]

reasons_not_empty_count = len(df_reasons_not_empty)
summary_not_empty_count = len(df_summary_not_empty)

print(f"Number of rows where Reasons column is not empty: {reasons_not_empty_count}")
print(f"Number of rows where Summary column is not empty: {summary_not_empty_count}")

In [None]:
# Frequency of each record "Type" among the rows whose "Reasons" value is present
type_of_rows_with_reasons = df_reasons_not_empty.loc[:, "Type"]
df_reasons_not_empty_types = type_of_rows_with_reasons.value_counts()
print(f"Types of rows where Reasons column is not empty: \n{df_reasons_not_empty_types}")

In [None]:
# Count the 'RecordTypeAudioRecording' rows that have no value in "Reasons"
is_audio_recording = df["Type"].eq("RecordTypeAudioRecording")
has_no_reasons = df["Reasons"].isna()

df_audio_recording_empty_reasons = df.loc[is_audio_recording & has_no_reasons]
audio_recording_empty_reasons_count = len(df_audio_recording_empty_reasons)
print(f"Number of rows of type 'RecordTypeAudioRecording' with empty 'Reasons' column: {audio_recording_empty_reasons_count}")

## Create a new dataframe

* For potentially fine-tuning a model on the data
* For providing input to the Claim Extractor

In [None]:
# As per the problem statement, keep only the desired columns.
# .copy() makes main_df an independent frame, so the later assignment to
# main_df["Content"] does not raise SettingWithCopyWarning and cannot touch `df`.
main_df = df[
    ["ID", "URL", "Type", "Source", "CreatedAt", "Summary", "Reasons", "Content"]
].copy()

In [None]:
from nltk.tokenize import sent_tokenize
import re
import json
def _number_sentences(text_chunks):
    """Sentence-tokenize each chunk in order and return a JSON object string numbering the sentences from 1."""
    numbered = {}
    for chunk in text_chunks:
        for sentence in sent_tokenize(chunk):
            numbered[len(numbered) + 1] = sentence
    # json.dumps turns the integer keys into strings: {"1": "...", "2": "..."}
    return json.dumps(numbered)


def split_content_based_on_type(row):
    """Convert one record's "Content" into numbered sentences, depending on its "Type".

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with "Content" and "Type" keys.

    Returns:
        * "<AUDIO_CONTENT>" when Type is "RecordTypeAudioRecording", whatever the content.
        * "" when the content is not a string (e.g. NaN/None) or the Type is not
          one of the types handled here.
        * Otherwise a JSON object string mapping "1", "2", ... to consecutive sentences.
          Conversations are first split into turns at every "User:" / "Agent:" marker;
          surveys are tokenized as a single text.
    """
    content = row["Content"]
    record_type = row["Type"]  # not named `type`, to avoid shadowing the builtin

    if record_type == "RecordTypeAudioRecording":
        return "<AUDIO_CONTENT>"

    if not isinstance(content, str):
        return ""

    if record_type == "RecordTypeConversation":
        # The zero-width lookahead splits *before* each marker, so the marker
        # stays at the start of its own turn.
        turns = (turn.strip() for turn in re.split(r"(?=User:|Agent:)", content))
        return _number_sentences([turn for turn in turns if turn])

    if record_type == "RecordTypeSurvey":
        return _number_sentences([content])

    # Unrecognised type: return the same empty marker as for missing content
    # instead of implicitly returning None.
    return ""

In [None]:
# Build the tokenised "Content" column from the untouched `df` rather than from
# main_df itself, so re-running this cell never re-tokenises already-converted text.
# `assign` rebinds main_df to a new frame instead of writing into a frame sliced from `df`.
main_df = main_df.assign(
    Content=df[["Content", "Type"]].apply(split_content_based_on_type, axis=1)
)

In [None]:
# Peek at the first converted "Content" value only
for position, content in enumerate(main_df["Content"].iloc[:1], start=1):
    print(f"Content {position}: NumDialogues: - {content}\n")

In [None]:
import json

def csv_to_json(jsonl_file_path, df=None):
    """Write a DataFrame to ``jsonl_file_path`` as JSON Lines (one JSON object per row).

    Args:
        jsonl_file_path: Destination path; an existing file is overwritten.
        df: DataFrame to export. When omitted, the notebook-level ``main_df`` is
            used, which is what the original single-argument call exported.

    Missing values (float NaN) are written as JSON ``null``. ``json.dump`` would
    otherwise emit the bare token ``NaN``, which is not valid JSON and is
    rejected by strict parsers.
    """
    import math  # local import: the notebook has no single import cell to extend

    if df is None:
        df = main_df

    def _nan_to_none(value):
        # Only float NaN is replaced; every other value is passed through unchanged
        return None if isinstance(value, float) and math.isnan(value) else value

    with open(jsonl_file_path, "w", encoding="utf-8") as f:
        for _, row in df.iterrows():
            record = {key: _nan_to_none(value) for key, value in row.to_dict().items()}
            json.dump(record, f)
            f.write("\n")


# Export the rows of main_df, one JSON object per line, to ./enterpret.jsonl
csv_to_json("./enterpret.jsonl")

In [None]:
# Save main_df to main_df.csv; index=False leaves the row index out of the file
main_df.to_csv("main_df.csv", index=False)

In [None]:
# Easier to work with JSON
class Content:
    """Attribute-style view of one exported record.

    Attributes set from the record's keys:
        id ("ID"), url ("URL"), type ("Type"), timestamp ("CreatedAt"),
        source ("Source"), content ("Content"), summary ("Summary"),
        reason ("Reasons").
    """

    # (attribute name, record key) pairs, in the order they are read
    _ATTRIBUTE_KEYS = (
        ("id", "ID"),
        ("url", "URL"),
        ("type", "Type"),
        ("timestamp", "CreatedAt"),
        ("source", "Source"),
        ("content", "Content"),
        ("summary", "Summary"),
        ("reason", "Reasons"),
    )

    def __init__(self, dict):
        """Copy the fields of the record mapping ``dict`` onto the instance.

        Raises KeyError if any of the expected keys is missing.
        """
        for attribute_name, record_key in self._ATTRIBUTE_KEYS:
            setattr(self, attribute_name, dict[record_key])

    @property
    def is_conversation(self):
        """True when the record's type is "RecordTypeConversation"."""
        return self.type == "RecordTypeConversation"

    @property
    def is_survey(self):
        """True when the record's type is "RecordTypeSurvey"."""
        return self.type == "RecordTypeSurvey"