In [1]:
# Install the third-party packages this notebook imports into the kernel's environment.
%pip install pandas nltk chardet

Collecting chardet
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Downloading chardet-5.2.0-py3-none-any.whl (199 kB)
   ---------------------------------------- 0.0/199.4 kB ? eta -:--:--
   ---------------------------------------- 199.4/199.4 kB 5.9 MB/s eta 0:00:00
Installing collected packages: chardet
Successfully installed chardet-5.2.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
from collections import defaultdict

import chardet
import pandas as pd
from nltk import ngrams

In [3]:
# Paths to the tab-separated input files.
answers, blog, email, news = (
    'annotated_sentences/answers.tsv',
    'annotated_sentences/blog.tsv',
    'annotated_sentences/email.tsv',
    'annotated_sentences/news.tsv',
)
file_paths = [answers, blog, email, news]
# One detected encoding per entry of file_paths; filled in by the detection cell.
encodings = list()

In [4]:
# Detect the encoding of each input file from its raw bytes.
# The list is rebuilt from scratch so that re-running this cell is idempotent:
# appending to a list created in another cell would add duplicate entries on
# every re-run.
encodings = []
for file_path in file_paths:
    with open(file_path, 'rb') as file:
        # chardet.detect returns a dict; only its 'encoding' entry is kept.
        encodings.append(chardet.detect(file.read())['encoding'])

In [5]:
def read_data(paths=None, file_encodings=None):
    """Read every tab-separated input file and stack them into one DataFrame.

    Parameters
    ----------
    paths : sequence of str, optional
        Files to read. Defaults to the notebook-level ``file_paths``.
    file_encodings : sequence of (str or None), optional
        Encoding to use for the file at the same position in ``paths``.
        Defaults to the notebook-level ``encodings``.

    Returns
    -------
    pandas.DataFrame
        All rows of all files, read without a header row (integer column
        labels), with a fresh 0..n-1 row index.

    Raises
    ------
    ValueError
        If fewer encodings than paths are supplied.
    """
    # Fall back to the notebook globals so existing `read_data()` calls still work.
    if paths is None:
        paths = file_paths
    if file_encodings is None:
        file_encodings = encodings

    # zip() would silently drop files if the encoding list were too short.
    if len(file_encodings) < len(paths):
        raise ValueError(
            f"expected one encoding per file: got {len(file_encodings)} "
            f"encodings for {len(paths)} files"
        )

    dfs = [
        pd.read_csv(path, sep='\t', encoding=enc, header=None)
        for path, enc in zip(paths, file_encodings)
    ]
    # ignore_index avoids repeated row labels (each file restarts at 0).
    return pd.concat(dfs, ignore_index=True)

In [6]:
def process(df, output_dir='./annotated_ngrams', max_n=5):
    """Aggregate sentence scores per n-gram and write one TSV file per n-gram order.

    Each sentence is split on whitespace. Every n-gram of order 1..``max_n``
    found in it accumulates the row's score and an occurrence count. For each
    order, a tab-separated file ``{output_dir}/{n}-gram.txt`` is written with
    the columns ``n-gram``, ``total_score``, ``total_occurrences`` and
    ``average_score`` (total divided by occurrences).

    Parameters
    ----------
    df : pandas.DataFrame
        Column ``0`` holds the score (anything ``float()`` accepts) and
        column ``3`` holds the sentence text.
    output_dir : str, optional
        Directory for the result files; created if it does not exist.
    max_n : int, optional
        Largest n-gram order to compute.

    Returns
    -------
    None
        Results are written to disk and a confirmation line is printed per file.
    """
    orders = range(1, max_n + 1)
    # Per order: n-gram text -> [total_score, total_occurrences].
    # The average is derived once when writing, not on every increment.
    ngram_data = {n: defaultdict(lambda: [0.0, 0]) for n in orders}

    # Iterate the two needed columns directly; this is a plain Python loop,
    # not a vectorized operation.
    for raw_score, sentence in zip(df[0], df[3]):
        # Rows with a missing score or sentence cannot contribute: a NaN score
        # would poison the totals and a NaN sentence has nothing to split.
        if pd.isna(raw_score) or pd.isna(sentence):
            continue
        score = float(raw_score)
        # Split once per sentence; str() guards against cells pandas parsed as numbers.
        tokens = str(sentence).split()

        for n in orders:
            counts = ngram_data[n]
            for ngram in ngrams(tokens, n):
                entry = counts[' '.join(ngram)]
                entry[0] += score
                entry[1] += 1

    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)

    # Write n-gram data to separate txt files
    for n in orders:
        output_file = f'{output_dir}/{n}-gram.txt'
        ngram_df = pd.DataFrame(
            [
                (key, total, count, total / count)
                for key, (total, count) in ngram_data[n].items()
            ],
            columns=['n-gram', 'total_score', 'total_occurrences', 'average_score'],
        )
        ngram_df.to_csv(output_file, sep='\t', index=False)
        print(f"Data successfully processed and written to files in {output_file}")

In [7]:
# Load all input files into one DataFrame, then aggregate the n-gram scores
# and write the per-order result files.
# NOTE(review): the saved output of this cell reports ./output/..., which does
# not match the path process() currently writes to; re-run to refresh it.
df = read_data()
process(df)

Data successfully processed and written to files in ./output/1-gram.txt
Data successfully processed and written to files in ./output/2-gram.txt
Data successfully processed and written to files in ./output/3-gram.txt
Data successfully processed and written to files in ./output/4-gram.txt
Data successfully processed and written to files in ./output/5-gram.txt
