In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import re

#Data pre-processing
def clean_text(text):
    """Strip HTML markup, URL-like tokens and '='-prefixed tokens from one text cell.

    Parameters
    ----------
    text : object
        Raw cell value. Anything that is not ``str``/``bytes`` (for example the
        float NaN pandas produces for an empty CSV cell) is treated as empty text.

    Returns
    -------
    str
        The cleaned text, or ``''`` when ``text`` is not textual.
    """
    # BeautifulSoup expects markup text; a missing cell arrives here as a float
    # NaN, so bail out early instead of letting the parser raise.
    if not isinstance(text, (str, bytes)):
        return ''

    # Remove HTML tags using BeautifulSoup
    text_no_html = BeautifulSoup(text, "html.parser").get_text()

    # Remove URLs using regular expressions.
    # 'http\S+' already covers every 'https...' token, and the pattern has no
    # '^'/'$' anchors, so neither a separate 'https' branch nor re.MULTILINE is needed.
    text_no_urls = re.sub(r'http\S+|www\S+', '', text_no_html)

    # Remove unwanted sequences: first '=' followed by non-whitespace
    # (presumably quoted-printable e-mail artefacts such as '=20' -- TODO confirm),
    # then any '=' left on its own.
    text_cleaned = re.sub(r'=\S+', '', text_no_urls)
    text_cleaned = re.sub(r'=', '', text_cleaned)

    return text_cleaned

# Read rejections.csv and replace its "Text" column with the cleaned version
# produced by clean_text, all in one expression.
df = pd.read_csv('rejections.csv').assign(
    Text=lambda frame: frame['Text'].apply(clean_text)
)

# Persist the cleaned frame (without the index column) for the next step.
df.to_csv('cleaned_text_filtered.csv', index=False)

print("Text data has been successfully cleaned and saved to cleaned_text_filtered.csv")

Text data has been successfully cleaned and saved to cleaned_text_filtered.csv


In [None]:
import nltk
nltk.download('punkt')

import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize

file_path = 'cleaned_text_filtered.csv'
df = pd.read_csv(file_path)

# Fall back to the row index as identifier when the file has no ID column.
if 'ID' not in df.columns:
    df['ID'] = df.index

tokenized_data = []

# Sentence tokenizer: one output record per sentence, tagged with its text's ID.
for text_id, text in zip(df['ID'], df['Text']):
    # Cells that are empty in the CSV come back from read_csv as float NaN,
    # which sent_tokenize cannot process; there is nothing to tokenize anyway.
    if not isinstance(text, str):
        continue

    for sentence in sent_tokenize(text):
        # split()/join collapses every whitespace run -- '\n', '\r', tabs and
        # repeated spaces -- into a single space. Replacing only '\n' left
        # carriage returns inside the sentences.
        clean_sentence = ' '.join(sentence.split())
        if clean_sentence:
            tokenized_data.append({'ID': text_id, 'Sentence': clean_sentence})

# Create a new DataFrame from the tokenized data. The explicit column list keeps
# the 'ID'/'Sentence' header even when no sentence was produced.
tokenized_df = pd.DataFrame(tokenized_data, columns=['ID', 'Sentence'])

# Save the tokenized sentences to a new CSV file
output_file_path = 'tokenized_output_file.csv'  # Specify the output file path
tokenized_df.to_csv(output_file_path, index=False)

# Display the first few rows of the tokenized DataFrame
print(tokenized_df.head())

   ID                                           Sentence
0   0  Ref: 110127BR - 2018 Data Scientist Internship...
1   0  With reference to the application for\r the ab...
2   0  In keeping with our company policies, your app...
3   0  We highly encourage you to consider other oppo...
4   0  Sincerely,\r \r IBM Early Professional Recruit...


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import pandas as pd
import csv

def clean_csv(input_file_path, cleaned_file_path):
    """Copy a UTF-8 text file, leaving out lines that are blank after stripping.

    Parameters
    ----------
    input_file_path : path of the file to read.
    cleaned_file_path : path of the file to (over)write with the kept lines.
    """
    with open(input_file_path, 'r', encoding='utf-8') as source:
        with open(cleaned_file_path, 'w', encoding='utf-8') as target:
            # Lines are written unchanged; only whitespace-only lines are dropped.
            target.writelines(row for row in source if row.strip())

def load_csv(file_path):
    """Read a CSV into a DataFrame, retrying once on a blank-line-stripped copy.

    The file is first read directly. If pandas raises a ``ParserError``, a copy
    without blank lines is written to ``cleaned_file.csv`` (via ``clean_csv``)
    and that copy is read instead. Every failure is reported with ``print``.

    Parameters
    ----------
    file_path : path of the CSV file to load.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame, or ``None`` when neither the direct read nor the
        retry succeeded.
    """
    try:
        return pd.read_csv(file_path, on_bad_lines='warn')
    except pd.errors.EmptyDataError:
        print("The file is empty or has no valid data.")
        return None
    except pd.errors.ParserError as e:
        print(f"Error parsing CSV: {e}")
        print("Attempting to clean the CSV file...")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

    # Only reached after a ParserError. The retry gets its own guard so that a
    # second failure is reported and turned into None like every other error,
    # instead of escaping from inside the first handler.
    cleaned_file_path = 'cleaned_file.csv'
    try:
        clean_csv(file_path, cleaned_file_path)
        print(f"Cleaned file created at: {cleaned_file_path}")
        return pd.read_csv(cleaned_file_path, on_bad_lines='warn')
    except pd.errors.EmptyDataError:
        print("The file is empty or has no valid data.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    return None

def concatenate_contents(df):
    """Join all sentences that share an ID into one space-separated string.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'ID' and 'Sentence'.

    Returns
    -------
    pandas.DataFrame or None
        One row per ID with the joined text in 'Sentence'; ``None`` (after
        printing a message) when a required column is missing.

    Notes
    -----
    Missing sentences contribute an empty string to the joined text. The
    caller's frame is still normalised in place exactly as before, because
    code later in the notebook calls string methods on ``df['Sentence']``.
    """
    if 'ID' not in df.columns or 'Sentence' not in df.columns:
        print("The DataFrame does not contain the required columns 'ID' and 'Sentence'.")
        return None

    # Fill missing values BEFORE casting to str. Casting first stringifies NaN,
    # so fillna('') then finds nothing to fill and the literal text 'nan' ends
    # up inside the concatenated output.
    sentences = df['Sentence'].fillna('').astype(str)
    to_join = df[['ID']].assign(Sentence=sentences)

    # In-place side effect kept unchanged for callers that keep using `df`.
    df['Sentence'] = df['Sentence'].astype(str).fillna('')

    # Concatenate contents based on IDs
    return to_join.groupby('ID', as_index=False)['Sentence'].agg(' '.join)

# Source CSV and destination CSV for this step.
input_file_path = 'tokenized_output_file.csv'  # Replace with your CSV file path
output_file_path = 'concatenated_output_file.csv'  # Specify the output file path

# load_csv returns None when the file could not be read.
df = load_csv(input_file_path)

if df is not None:
    # Relabel the columns to the names concatenate_contents expects; this
    # raises if the file does not have exactly two columns.
    df = df.set_axis(['ID', 'Sentence'], axis=1)

    # One joined string per ID (None when the required columns are missing).
    result_df = concatenate_contents(df)

    if result_df is not None:
        # Write the per-ID text without the index column.
        result_df.to_csv(output_file_path, index=False)
        print(f"Concatenated contents saved to: {output_file_path}")


In [None]:
from transformers import pipeline

# Load the sentiment analysis pipeline once at module level; analyze_sentiment
# below calls it for every text.
# NOTE(review): no model name is passed, so the checkpoint is whatever the
# library selects for this task -- consider pinning a model for reproducibility.
sentiment_analysis = pipeline("sentiment-analysis")

def analyze_sentiment(text, max_length=512):
    """Return the majority sentiment label for ``text``.

    The text is split on whitespace into chunks of at most ``max_length``
    words, every chunk is classified by the module-level ``sentiment_analysis``
    pipeline, and the label that occurs most often is returned.

    Parameters
    ----------
    text : object
        Text to classify. Non-string values (e.g. NaN from pandas) are accepted
        and yield ``None``.
    max_length : int, optional
        Maximum number of words per chunk (default 512).

    Returns
    -------
    str or None
        The most frequent label; on a tie, the label that appeared first.
        ``None`` when ``text`` is not a string or contains no words.

    Raises
    ------
    ValueError
        If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be at least 1")

    # Nothing to classify: avoids calling .split() on NaN and calling max()
    # on an empty label list further down.
    if not isinstance(text, str):
        return None
    words = text.split()
    if not words:
        return None

    # Chunks are measured in words, not model tokens; truncation=True asks the
    # pipeline to cut whatever still exceeds the model's input limit.
    chunks = [' '.join(words[i:i + max_length]) for i in range(0, len(words), max_length)]

    results = sentiment_analysis(chunks, truncation=True)
    sentiments = [result['label'] for result in results]

    # Majority vote. dict.fromkeys keeps first-seen order, so ties resolve the
    # same way on every run; iterating a set of strings does not guarantee that.
    return max(dict.fromkeys(sentiments), key=sentiments.count)

# Function to classify the relationship between two sentiments
def classify_relationship(sent1, sent2):
    """Name the transition from one sentiment label to the following one.

    Parameters
    ----------
    sent1 : label of the earlier sentence.
    sent2 : label of the later sentence.

    Returns
    -------
    str
        One of the four named transitions for POSITIVE/NEGATIVE pairs, or
        "Other" for every other combination.
    """
    labels_by_pair = {
        ('POSITIVE', 'NEGATIVE'): "Polite Rejection",
        ('NEGATIVE', 'POSITIVE'): "Encouragement After Rejection",
        ('NEGATIVE', 'NEGATIVE'): "Justification of Rejection",
        ('POSITIVE', 'POSITIVE'): "Neutral or Supportive",
    }
    # Anything outside the table (e.g. a NEUTRAL label) is reported as "Other".
    return labels_by_pair.get((sent1, sent2), "Other")

# Apply the custom sentiment analysis function on the "Sentence" column
df['Sentiment'] = df['Sentence'].apply(analyze_sentiment)

# Analyze transitions between consecutive sentences of the same text.
# Plain lists avoid a positional .iloc lookup on every loop iteration.
sentiment_labels = df['Sentiment'].tolist()
# Without an ID column all rows are treated as belonging to one text.
text_ids = df['ID'].tolist() if 'ID' in df.columns else [None] * len(df)

transitions = []
for i in range(len(df) - 1):
    if text_ids[i] == text_ids[i + 1]:
        transitions.append(classify_relationship(sentiment_labels[i], sentiment_labels[i + 1]))
    else:
        # The next row starts a different text, so this sentence is the last
        # one of its text and has no transition.
        transitions.append(None)
if len(df) > 0:
    transitions.append(None)  # The last sentence has no transition
# For an empty frame the list stays empty, so the lengths always match.
df['Transition'] = transitions

# Save results to a new CSV file
output_file_path = 'tokenized_output_file_Final.csv'  # Specify the output file path
df.to_csv(output_file_path, index=False)

In [None]:
# Display each sentence with its sentiment label and the transition to the
# following row.
print(df[['Sentence', 'Sentiment', 'Transition']])

                                              Sentence Sentiment  \
0    Ref: 110127BR - 2018 Data Scientist Internship...  POSITIVE   
1    With reference to the application for\n the ab...  NEGATIVE   
2    In keeping with our company policies, your app...  NEGATIVE   
3    We highly encourage you to consider other oppo...  POSITIVE   
4                                                  nan  POSITIVE   
..                                                 ...       ...   
809  Hi Conor,\n Thank you for your interest in Air...  POSITIVE   
810  At this point, we've decided to move forward w...  POSITIVE   
811  We will keep your application details on recor...  NEGATIVE   
812  Thanks again for your interest in Airware and ...  POSITIVE   
813                        Regards,\n The Airware Team  POSITIVE   

                        Transition  
0                 Polite Rejection  
1       Justification of Rejection  
2    Encouragement After Rejection  
3            Neutral or Supportive 