## 0. Imports and Preliminaries

In [1]:
#0. Preliminaries: core imports plus the NLTK tokenizer data
import pandas as pd
import nltk
# Fetch the 'punkt' tokenizer data (no-op when already present, per the log below);
# presumably required by the nltk.word_tokenize calls used by the vectorizers -- confirm.
nltk.download('punkt')
from readability import Readability

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\maria\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import nltk
import string
import numpy as np
from sklearn.model_selection import train_test_split

In [3]:
# NLTK stopword lists; stopwords.words("english") is used as the vocabulary of the stopword vectorizer.
from nltk.corpus import stopwords
# Download the stopwords corpus so stopwords.words(...) can be read locally.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maria\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [79]:
from transformers import AutoTokenizer, pipeline

# Single checkpoint name shared by the tokenizer and the classification pipeline.
sentiment_model_name = "lxyuan/distilbert-base-multilingual-cased-sentiments-student"

tokenizer = AutoTokenizer.from_pretrained(sentiment_model_name, max_length = 512, truncation = True)

# return_all_scores=True makes the pipeline report a score for every label
# (positive / neutral / negative) instead of only the best one.
distilled_student_sentiment_classifier = pipeline(
    model=sentiment_model_name,
    tokenizer=tokenizer,
    return_all_scores=True,
)



In [82]:
# Smoke-test the classifier on one chunk, truncating the input to 512 tokens.
# NOTE: `df` is only loaded in section 1 further down, so this cell fails on a
# fresh top-to-bottom run unless the data has been loaded first.
tokenizer_kwargs = {'truncation': True, 'max_length': 512}
sample_chunk = df.loc[5, 'Chunk']
distilled_student_sentiment_classifier(sample_chunk, **tokenizer_kwargs)

[[{'label': 'positive', 'score': 0.5484877824783325},
  {'label': 'neutral', 'score': 0.21492399275302887},
  {'label': 'negative', 'score': 0.23658819496631622}]]

## 1. Load and Split Data

In [41]:
# 1. Import data: read the chunked author dataset from ../data into a DataFrame.
author_chunks_path = "../data/chunked_author_data.csv"
df = pd.read_csv(author_chunks_path)

In [42]:
# Two-stage split with a fixed seed: first hold out 20% of the rows as the test
# set, then hold out 20% of the remainder as the validation set.
split_kwargs = {"train_size": 0.8, "random_state": 42}
df_train_val, df_test = train_test_split(df, **split_kwargs)
df_train, df_val = train_test_split(df_train_val, **split_kwargs)

In [52]:
# Classify the first chunk. Truncation settings are passed explicitly, matching the
# other classifier calls in this notebook, so an over-long chunk is cut to 512 tokens
# instead of being fed to the model whole.
results_senti = distilled_student_sentiment_classifier(df.loc[0, 'Chunk'], truncation=True, max_length=512)

In [59]:
# Pull the score attached to the 'positive' label out of the first result.
[entry['score'] for entry in results_senti[0] if entry['label'] == 'positive']

[0.501285195350647]

## 2. Add Features

In [50]:
#Flesch 
def flesch_readability_scale(text):
    """Return the Flesch reading-ease score of `text`, or NaN when it cannot be computed.

    Parameters
    ----------
    text : str
        The passage to score.

    Returns
    -------
    float
        The score reported by `Readability(text).flesch()`. `np.nan` is returned
        for non-string or blank input and whenever the readability library
        raises (e.g. it is not possible for texts with fewer than 100 words).
    """
    # Missing values (None / NaN) and blank strings can never be scored.
    if not isinstance(text, str) or not text.strip():
        return np.nan

    try:
        return Readability(text).flesch().score
    except Exception:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still stop a long `.apply` run.
        return np.nan

In [83]:
# Sentiment Analysis (Positive Score)
# Tokenizer settings forwarded on every classifier call: truncate inputs to 512 tokens.
tokenizer_kwargs = {'truncation': True, 'max_length': 512}


def sentiment_analysis_score(text):
    """Return the classifier's 'positive' score for `text`.

    The text is run through `distilled_student_sentiment_classifier` with the
    module-level `tokenizer_kwargs`. If the first result does not contain
    exactly one entry labelled 'positive', `np.nan` is returned instead.
    """
    label_scores = distilled_student_sentiment_classifier(text, **tokenizer_kwargs)[0]
    positive_matches = [entry['score'] for entry in label_scores if entry['label'] == 'positive']
    if len(positive_matches) != 1:
        return np.nan
    return positive_matches[0]

In [9]:
# Punctuation features: L1-normalised term frequencies (no IDF weighting) over a
# fixed vocabulary made of the individual characters in string.punctuation.
# No ngram_range is given here, so the vectorizer's default applies.
punct_vectorizer = TfidfVectorizer(
    tokenizer=nltk.word_tokenize,
    use_idf=False,
    norm='l1',
    vocabulary=string.punctuation,
)

In [10]:
# Stopword features: raw counts over a fixed vocabulary of NLTK English stopwords.
# NOTE(review): the vocabulary presumably holds single words only, in which case the
# 2- and 3-grams generated by ngram_range=(1, 3) can never match a column -- confirm
# whether an n-gram vocabulary was intended.
stopword_vectorizer = CountVectorizer(
    ngram_range=(1, 3),
    tokenizer=nltk.word_tokenize,
    vocabulary=stopwords.words("english"),
)

In [None]:
# TODO: remove proper nouns and build POS-tag n-gram features (not implemented yet).
# Placeholder value so this unfinished cell no longer raises a SyntaxError on Run All.
proper_nouns = None

## 3. Apply Features to Datasets

In [11]:
def _vectorizer_features(vectorizer, texts, fit):
    """Run `vectorizer` over `texts` and return the result as a DataFrame indexed like `texts`."""
    matrix = vectorizer.fit_transform(texts) if fit else vectorizer.transform(texts)
    return pd.DataFrame(
        matrix.toarray(),
        columns=vectorizer.get_feature_names_out(),
        # Reuse the input index so the features line up row-for-row with the
        # (shuffled) split they were computed from.
        index=texts.index,
    )


def preprocess_data(df, text_col, train = False):
    """Return a copy of `df` with readability, sentiment and vectorizer features added.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the raw text. It is not modified.
    text_col : str
        Name of the column in `df` holding the text to featurise.
    train : bool, default False
        When True the vectorizers are fitted on this data (`fit_transform`);
        otherwise the already-fitted vectorizers are applied (`transform`).
        The training split therefore has to be processed first.

    Returns
    -------
    pandas.DataFrame
        `df` plus `flesch_score`, `sent_score` and one column per feature of
        `punct_vectorizer` and `stopword_vectorizer`. If a vectorizer step
        fails, the error is printed and that group of columns is left out.
    """
    # Work on a copy so the caller's frame (often a slice of the full data) is untouched.
    out = df.copy()

    # These run on the raw text, before any punctuation / proper-noun cleaning.
    out['flesch_score'] = out[text_col].apply(flesch_readability_scale)
    out['sent_score'] = out[text_col].apply(sentiment_analysis_score)

    # train has to be run first - transform on an unfitted vectorizer is caught below.
    try:
        punct_features_df = _vectorizer_features(punct_vectorizer, out[text_col], train)
        # axis=1 appends the features as columns rather than stacking them as rows.
        out = pd.concat([out, punct_features_df], axis=1)
    except Exception as e:
        print(f"Error In Generating Punctuation N-grams: {e}")

    try:
        stopwords_features_df = _vectorizer_features(stopword_vectorizer, out[text_col], train)
        out = pd.concat([out, stopwords_features_df], axis=1)
    except Exception as e:
        print(f"Error In Generating Stopword N-grams: {e}")

    return out

In [84]:
# Featurise the training split; train=True fits the vectorizers on this data.
df_processed = preprocess_data(df_train, text_col='Chunk', train=True)