# Libraries

In [21]:
# IO
import os
import csv
import pathlib
from pathlib import Path
import chardet

# Utilities
import numpy as np 
import pandas as pd



# Classes

# Functions

In [19]:
def extract_sentences_from_file(filepath):
    """Given an input txt file, extract sentences 
    and associated sentiments

    Args:
        filepath (string): path to string 

    Returns:
        list: list of sentences and sentiments
    """
    sentences = []
    # automatically detect endoding
    with open(filepath, 'rb') as file:
        encoding = chardet.detect(file.read())['encoding']
    # read and split
    with open(filepath, 'r', encoding=encoding) as file:
        for line in file:
            line = line.strip()
            if '.@' in line:
                sentence, sentiment = line.rsplit('.@', 1)
                sentence = sentence.strip()
                sentiment = sentiment.strip().lower()
                sentences.append((sentence, sentiment))
    return sentences

In [4]:
def process_directory(directory_path, output_csv):
    """Takes a directory path as input and generates a .csv file
    which contains sentences and sentiments as columns, extracted
    from .txt files inside the directory

    Args:
        directory_path (string): path to directory containing 
        .txt files
        output_csv (string): path to desired output .csv file
    """
    # Skip processing if CSV already exists
    if os.path.exists(output_csv):
        print(f"{output_csv} already exists. Skipping processing.")
        return

    all_sentences = []

    for filename in os.listdir(directory_path):
        if filename.endswith('.txt'):
            filepath = os.path.join(directory_path, filename)
            all_sentences.extend(extract_sentences_from_file(filepath))

    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Sentence', 'Sentiment'])  # Header
        writer.writerows(all_sentences)
    print(f"Processed files and wrote output to {output_csv}")


In [35]:
def convert_to_utf8(input_path, output_path):
    """Converts files into utf8 encoding

    Args:
        input_path (string): input .txt file path
        output_path (string): output .txt file path
    """
    with open(input_path, 'r', encoding='iso-8859-1') as source_file:
        content = source_file.read()
    with open(output_path, 'w', encoding='utf-8') as target_file:
        target_file.write(content)
    print(f"Converted '{input_path}' to UTF-8 and saved as '{output_path}'")

# Models

# Main

## Import data

OS-agnostic working folder and data folder definition

In [5]:
CodeDirectory = Path(os.path.abspath(''))
DATASET_FOLDER = os.path.join(str(CodeDirectory.parent.absolute()), "_data","")

In [6]:
raw_data_dir = DATASET_FOLDER+'FinancialPhraseBank-v1.0'
data_csv_path = DATASET_FOLDER+'data.csv'

Import raw data and convert into csv format

In [20]:
process_directory(raw_data_dir, data_csv_path)

Processed files and wrote output to /Users/exterior/Documents/IML/Project/Part2/_data/data.csv


In [32]:
with open(raw_data_dir+'/Sentences_50Agree.txt', 'rb') as file:
        print(chardet.detect(file.read())['encoding'])

ISO-8859-1


In [36]:
convert_to_utf8(raw_data_dir+'/Sentences_50Agree.txt',raw_data_dir+'/Sentences_50Agree_utf8.txt')

Converted '/Users/exterior/Documents/IML/Project/Part2/_data/FinancialPhraseBank-v1.0/Sentences_50Agree.txt' to UTF-8 and saved as '/Users/exterior/Documents/IML/Project/Part2/_data/FinancialPhraseBank-v1.0/Sentences_50Agree_utf8.txt'


Import data into pandas dataframe

In [22]:
df = pd.read_csv(data_csv_path)

In [25]:
df.head(10)

Unnamed: 0,Sentence,Sentiment
0,"According to Gran , the company has no plans t...",neutral
1,Technopolis plans to develop in stages an area...,neutral
2,With the new production plant the company woul...,positive
3,According to the company 's updated strategy f...,positive
4,"For the last quarter of 2010 , Componenta 's n...",positive
5,"In the third quarter of 2010 , net sales incre...",positive
6,Operating profit rose to EUR 13.1 mn from EUR ...,positive
7,"Operating profit totalled EUR 21.1 mn , up fro...",positive
8,TeliaSonera TLSN said the offer is in line wit...,positive
9,"STORA ENSO , NORSKE SKOG , M-REAL , UPM-KYMMEN...",positive


In [34]:
print(df.loc[12]["Sentence"])

Clothing retail chain Sepp+ñl+ñ 's sales increased by 8 % to EUR 155.2 mn , and operating profit rose to EUR 31.1 mn from EUR 17.1 mn in 2004
