# Master Thesis - Mattia Piazzalunga
In this notebook, the exploratory analysis and the harmful-noise cleaning of the English dataset are carried out.

*Title*: Bridging a GAP: Text Style Transfer from Journalistic to Conversational for enhanced social media dissemination of news

*Supervisor*: Gabriella Pasi <br>
*Author*: Mattia Piazzalunga

*University*: Bicocca University of Milan <br>
*Department*: Informatics, Systems and Communication <br>
*Course*: Computer Science <br>
*Academic year*: 2023/2024

*Info*: This notebook was run on Google Colab. Upload the whole repository before running.

*For suggestions or questions*: mattiapiazzalunga@outlook.com

## Initialization

### Downloading libraries

In [1]:
!pip install pandas matplotlib nltk textblob textstat wordcloud torch transformers

Collecting textstat
  Downloading textstat-0.7.4-py3-none-any.whl.metadata (14 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.16.0-py3-none-any.whl.metadata (3.2 kB)
Downloading textstat-0.7.4-py3-none-any.whl (105 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.16.0-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.16.0 textstat-0.7.4


### Importing libraries

In [2]:
# Import necessary libraries
from nltk.corpus import stopwords
import nltk
from tqdm.auto import tqdm
import textstat
from nltk.tokenize import word_tokenize
from google.colab import drive
import re
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import numpy as np
import csv
import urllib.request
import pandas as pd
import unicodedata

In [3]:
# Download the NLTK data packages: 'punkt' (tokenizer models), 'stopwords'
# (read below through stopwords.words) and 'wordnet'.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:
# English stop-word list from NLTK, stored as a set
stop_words = set(stopwords.words('english'))

### Connect to Google Drive

In [5]:
# Mount Google Drive at /content/drive; the dataset is read from there below
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Base folder on the mounted Drive; the dataset path is built relative to it
default_path = "/content/drive/My Drive/MT/TM/"

### Importing the dataset

In [7]:
# Load the English corpus from the 'corpora' folder that sits one level above default_path
df = pd.read_csv(default_path+'/../corpora/J2C_news_EN.csv')

## Exploratory analysis

### First analysis

In [8]:
# Preview the first 5 rows of the dataset to check its column layout
df.head()

Unnamed: 0,journalistic,conversational
0,A photo shows a Florida deputy having a tea pa...,A precious photo of a Florida deputy spending ...
1,Egypt's ambassador to the U.S. says the Muslim...,“My advice to the Muslim Brotherhood is they n...
2,"(Getty Images) There are more than 129,817 fed...","There are more than 129,817 federally licensed..."
3,Ferrera thanks Trump for his offensive tactics...,America Ferrera to Donald Trump: Thanks! --.
4,L'Osservatore Romano Vatican Pool/Getty Images...,This time there's no need to mourn; because th...


In [9]:
# Display the number of rows of the dataset (len() does not report the column count)
len(df)

5352

In [10]:
# Count the missing (NaN) values in each column
df.isna().sum()

Unnamed: 0,0
journalistic,0
conversational,0


### Unique words by column

In [11]:
def extract_unique_words(text_series):
    """
    Collect the distinct lowercase word tokens found in a Series of texts.

    Args:
        text_series (pd.Series): Texts to scan; missing values are skipped.

    Returns:
        set: Every distinct token matched by the word regex.
    """
    vocabulary = set()
    for document in text_series.dropna():
        # Lowercase first, then keep only runs of word characters (drops punctuation)
        vocabulary |= set(re.findall(r'\b\w+\b', document.lower()))
    return vocabulary

# Vocabulary of each style
unique_journalistic = extract_unique_words(df['journalistic'])
unique_conversational = extract_unique_words(df['conversational'])

# Vocabulary of the whole dataset: words that appear in either style
total_unique_words = unique_journalistic | unique_conversational

# Report the three vocabulary sizes
print(f"Unique words in the 'journalistic' column: {len(unique_journalistic)}")
print(f"Unique words in the 'conversational' column: {len(unique_conversational)}")
print(f"Total unique words in the dataset: {len(total_unique_words)}")

Unique words in the 'journalistic' column: 79692
Unique words in the 'conversational' column: 16979
Total unique words in the dataset: 80024


### Text Lengths by Column

In [12]:
# Word counter that ignores punctuation and any token containing non-letters
def count_words(text):
    """
    Count the purely alphabetic tokens produced by NLTK's word_tokenize.

    Args:
        text (str): Text to tokenize.

    Returns:
        int: Number of tokens for which str.isalpha() is true.
    """
    return len([token for token in word_tokenize(text) if token.isalpha()])

# Word count of every text, stored in '<column>_length' next to its source column
for source_col in ('journalistic', 'conversational'):
    df[f'{source_col}_length'] = df[source_col].apply(count_words)

# Mean number of words per style
avg_journalistic_length = df['journalistic_length'].mean()
avg_conversational_length = df['conversational_length'].mean()

print(f"Average length in 'journalistic' column: {avg_journalistic_length:.2f} words")
print(f"Average length in 'conversational' column: {avg_conversational_length:.2f} words")

# IQR-based outlier counter
def count_outliers(series):
    """
    Count the values lying outside the 1.5 * IQR fences of a numeric Series.

    Args:
        series (pd.Series): Numeric values to inspect.

    Returns:
        int: Number of values below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR.
    """
    first_quartile = series.quantile(0.25)
    third_quartile = series.quantile(0.75)
    fence_width = 1.5 * (third_quartile - first_quartile)

    too_low = series < first_quartile - fence_width
    too_high = series > third_quartile + fence_width
    return series[too_low | too_high].count()

# Outlier counts for the two length columns (journalistic first, then conversational)
num_journalistic_outliers, num_conversational_outliers = (
    count_outliers(df[length_col])
    for length_col in ('journalistic_length', 'conversational_length')
)

print(f"Number of outliers in 'journalistic_length': {num_journalistic_outliers}")
print(f"Number of outliers in 'conversational_length': {num_conversational_outliers}")

Average length in 'journalistic' column: 539.16 words
Average length in 'conversational' column: 26.60 words
Number of outliers in 'journalistic_length': 169
Number of outliers in 'conversational_length': 248


### Find correctly written articles

In [13]:
# Rows whose 'conversational' text has fewer than 50 characters
is_short_conversational = df['conversational'].str.len() < 50
conversational_short = df[is_short_conversational]
num_conversational_short = len(conversational_short)
print(f"Number of 'conversational' articles with less than 50 characters: {num_conversational_short}")

# Rows whose 'journalistic' text has 2100 to 2400 whitespace-separated words (bounds included)
df['journalistic_word_count'] = df['journalistic'].str.split().apply(len)
is_in_word_range = df['journalistic_word_count'].between(2100, 2400)
journalistic_in_range = df[is_in_word_range]
num_journalistic_in_range = len(journalistic_in_range)
print(f"Number of 'journalistic' articles with 2100-2400 words: {num_journalistic_in_range}")

# Rows that meet both criteria at once
both_conditions = df[is_short_conversational & is_in_word_range]
num_both = len(both_conditions)
print(f"Number of rows that satisfy both conditions: {num_both}")


Number of 'conversational' articles with less than 50 characters: 87
Number of 'journalistic' articles with 2100-2400 words: 12
Number of rows that satisfy both conditions: 0


### Sentiment Analysis Comparison Between Styles

In [14]:
# Use the GPU when one is visible, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Sentiment checkpoint and its tokenizer; the model is placed on the device in eval mode
MODEL = 'cardiffnlp/twitter-roberta-base-sentiment-latest'
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.to(device)
model.eval()

# Class names: the second tab-separated field of every line of the mapping file
mapping_link = 'https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/mapping.txt'
with urllib.request.urlopen(mapping_link) as response:
    mapping_lines = response.read().decode('utf-8').splitlines()
labels = [fields[1] for fields in csv.reader(mapping_lines, delimiter='\t')]

print("Labels:", labels)

# Preprocess function
def preprocess_tweet(text):
    """
    Replace user mentions and links with placeholder tokens.

    The text is split on single spaces. A token that starts with '@' and has
    more than one character becomes '@user'; a token that starts with 'http'
    becomes 'http'. Every other token is kept unchanged.

    Args:
        text (str): Raw input text.

    Returns:
        str: The text with mentions and links masked, re-joined with spaces.
    """
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)


# Backward-compatible alias. A later cell defines a different function that is
# also called `preprocess`; TextDataset therefore refers to `preprocess_tweet`
# so that re-running the sentiment analysis afterwards still masks mentions and
# links instead of silently picking up the other function.
preprocess = preprocess_tweet


# Custom Dataset class
class TextDataset(Dataset):
    """Dataset of texts that are masked with `preprocess_tweet` at construction time."""

    def __init__(self, texts):
        # Preprocess once up front so __getitem__ is a plain lookup
        self.texts = [preprocess_tweet(text) for text in texts]

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx]

# Batched sentiment inference
def get_sentiment_labels(texts, batch_size=64, max_length=128):
    """
    Predict one sentiment label per text, processing the texts in batches.

    Relies on the module-level `tokenizer`, `model`, `device` and `labels`;
    later cells rebind `tokenizer` and `model` to other checkpoints.

    Args:
        texts (list of str): Texts to classify (wrapped in a TextDataset).
        batch_size (int): Number of texts per forward pass.
        max_length (int): Token limit passed to the tokenizer; longer inputs are truncated.

    Returns:
        list of str: Predicted label name for every text, in input order.
    """
    loader = DataLoader(TextDataset(texts), batch_size=batch_size)
    predicted = []

    with torch.no_grad():
        for text_batch in tqdm(loader, desc="Processing batches"):
            encoded = tokenizer(
                text_batch,
                return_tensors='pt',
                padding=True,
                truncation=True,
                max_length=max_length
            )

            # Send every tensor to the device, leaving out 'token_type_ids' when present
            model_inputs = {}
            for name, tensor in encoded.items():
                if name == 'token_type_ids':
                    continue
                model_inputs[name] = tensor.to(device)

            class_probs = torch.nn.functional.softmax(model(**model_inputs).logits, dim=-1)

            # Most probable class per text, translated to its label name
            for class_id in torch.argmax(class_probs, dim=1).cpu().tolist():
                predicted.append(labels[class_id])

    return predicted


# Pull both text columns out of the DataFrame as plain Python lists
all_journalistic_texts = df['journalistic'].tolist()
all_conversational_texts = df['conversational'].tolist()

# Classify each style and store the labels on the DataFrame
print("Processing Journalistic texts...")
journalistic_sentiments = get_sentiment_labels(all_journalistic_texts, batch_size=256, max_length=128)
print("Processing Conversational texts...")
conversational_sentiments = get_sentiment_labels(all_conversational_texts, batch_size=256, max_length=128)
df['Journalistic_Sentiment'] = journalistic_sentiments
df['Conversational_Sentiment'] = conversational_sentiments

# Label frequencies per style
journalistic_counts = df['Journalistic_Sentiment'].value_counts()
conversational_counts = df['Conversational_Sentiment'].value_counts()

for heading, tally in (
    ("\nJournalistic Column Sentiment Counts:", journalistic_counts),
    ("\nConversational Column Sentiment Counts:", conversational_counts),
):
    print(heading)
    print(tally)

# Describe how the sentiment label moves from the journalistic to the conversational text
def get_sentiment_change_type(journalistic, conversational):
    """
    Name the transition between two sentiment labels.

    Args:
        journalistic: Label of the journalistic text.
        conversational: Label of the conversational text.

    Returns:
        str: 'No_Change' when the labels are equal, otherwise
        '<journalistic>_to_<conversational>'.
    """
    if journalistic == conversational:
        return 'No_Change'
    return f"{journalistic}_to_{conversational}"

# Label every row with its sentiment transition
sentiment_pairs = df[['Journalistic_Sentiment', 'Conversational_Sentiment']]
df['Sentiment_Change_Type'] = sentiment_pairs.apply(
    lambda pair: get_sentiment_change_type(*pair),
    axis=1
)

# Frequency of each transition
change_type_counts = df['Sentiment_Change_Type'].value_counts()

print("\nSentiment Change Counts:")
print(change_type_counts)

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Labels: ['negative', 'neutral', 'positive']
Processing Journalistic texts...


Processing batches: 100%|██████████| 21/21 [00:46<00:00,  2.21s/it]


Processing Conversational texts...


Processing batches: 100%|██████████| 21/21 [00:27<00:00,  1.30s/it]


Journalistic Column Sentiment Counts:
Journalistic_Sentiment
neutral     3576
negative    1094
positive     682
Name: count, dtype: int64

Conversational Column Sentiment Counts:
Conversational_Sentiment
neutral     3023
negative    1635
positive     694
Name: count, dtype: int64

Sentiment Change Counts:
Sentiment_Change_Type
No_Change               3562
neutral_to_negative      816
neutral_to_positive      335
positive_to_neutral      302
negative_to_neutral      296
positive_to_negative      31
negative_to_positive      10
Name: count, dtype: int64





### Calculate readability

In [15]:
# Text cleaning applied before the readability scoring
def preprocess(text):
    """
    Lowercase the text, reduce it to ASCII and strip most punctuation.

    Characters are NFKD-normalised and anything that cannot be encoded as
    ASCII is dropped. After that, every character that is not a word
    character, whitespace, or one of '.', '!' and '?' is removed.

    Args:
        text (str): Raw text.

    Returns:
        str: The cleaned text.
    """
    lowered = text.lower()
    # Decompose accented characters, then discard whatever is not plain ASCII
    decomposed = unicodedata.normalize('NFKD', lowered)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('ascii')
    # Keep sentence-ending punctuation (. ! ?) and remove every other punctuation mark
    return re.sub(r'[^\w\s\.\!\?]', '', ascii_only)

# Store a cleaned copy of each text column as '<column>_clean'
for source_col in ('journalistic', 'conversational'):
    df[f'{source_col}_clean'] = df[source_col].apply(preprocess)

# Readability scorer
def get_readability(text):
    """
    Return textstat's Flesch Reading Ease score for a text.

    Args:
        text (str): Text to score.

    Returns:
        float: The score, or NaN when the text is empty or only whitespace.
    """
    if not text.strip():
        return np.nan
    return textstat.flesch_reading_ease(text)

# Score the readability of each cleaned column; results go to '<column>_readability'
for col in ['journalistic_clean', 'conversational_clean']:
    df[f'{col}_readability'] = df[col].apply(get_readability)

# Row-by-row comparison of the conversational score against the journalistic one
comparison_results = {}

metrics = ['readability']
for metric in metrics:
    jour_col = f'journalistic_clean_{metric}'
    conv_col = f'conversational_clean_{metric}'
    comp_col = f'{metric}_comparison'

    conv_scores = df[conv_col]
    jour_scores = df[jour_col]

    # Start from 'equal'/'worse', then overwrite with 'better' where the conversational score is higher
    outcome = np.where(conv_scores < jour_scores, 'worse', 'equal')
    df[comp_col] = np.where(conv_scores > jour_scores, 'better', outcome)

    # A row with a missing score on either side has no meaningful comparison
    has_missing_score = jour_scores.isna() | conv_scores.isna()
    df.loc[has_missing_score, comp_col] = np.nan

    # Tally the outcomes, leaving the NaN rows out
    counts = df[comp_col].value_counts(dropna=True)
    comparison_results[metric.capitalize()] = counts

# One column per metric, one row per outcome
summary_df = pd.DataFrame(comparison_results).fillna(0).astype(int)
print("Summary of Comparisons:")
print(summary_df)

# Mean of every metric column, keyed by the column name
mean_values = {
    f'{col_type}_{metric}': df[f'{col_type}_{metric}'].mean()
    for metric in metrics
    for col_type in ['journalistic_clean', 'conversational_clean']
}

# Print the journalistic and conversational means side by side for each metric
print("\nMean Values:")
for metric in metrics:
    jour_mean = mean_values[f'journalistic_clean_{metric}']
    conv_mean = mean_values[f'conversational_clean_{metric}']
    print(f"{metric.capitalize()} - Journalistic Mean: {jour_mean:.2f}, Conversational Mean: {conv_mean:.2f}")

Summary of Comparisons:
                        Readability
readability_comparison             
better                         2904
worse                          2439
equal                             9

Mean Values:
Readability - Journalistic Mean: 60.44, Conversational Mean: 62.14


### Calculate subjectivity

In [16]:
# Use the GPU when one is visible, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Subjectivity checkpoint and its tokenizer (these rebind the module-level `tokenizer` and `model`)
model_name = 'GroNLP/mdebertav3-subjectivity-english'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
model.eval()  # Inference only: switch to evaluation mode

# Show how the class ids map to label names
label_mapping = model.config.id2label
print("Label mapping:", label_mapping)

def batch_predict(texts, batch_size=64):
    """
    Predict a subjectivity label for every text, working through the list in batches.

    Uses the module-level `tokenizer`, `model` and `device`.

    Args:
        texts (list): List of text strings to classify.
        batch_size (int): Number of texts to process in each batch.

    Returns:
        list: Predicted label names (from model.config.id2label), in input order.
    """
    results = []

    for start in tqdm(range(0, len(texts), batch_size), desc="Processing batches"):
        chunk = texts[start:start + batch_size]

        # Pad within the batch and truncate anything longer than 128 tokens
        encoded = tokenizer(
            chunk,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=128
        )
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

        with torch.no_grad():
            logits = model(**encoded).logits

        # Highest-scoring class id per text, translated to its label name
        results += [model.config.id2label[class_id] for class_id in logits.argmax(dim=1).tolist()]

    return results

# Label one text column and report the label frequencies
def process_column(df, column_name):
    """
    Predict subjectivity labels for one column and store them on the DataFrame.

    The labels are written to '<column_name>_Subjectivity' on the DataFrame
    that was passed in, and their frequencies are printed.

    Args:
        df (pd.DataFrame): DataFrame containing the data.
        column_name (str): Name of the column to process.

    Returns:
        pd.DataFrame: The same DataFrame, now holding the new label column.
    """
    print(f"Processing '{column_name}' column...")
    label_col = f'{column_name}_Subjectivity'
    # Cast to str first so that every value handed to the tokenizer is a string
    df[label_col] = batch_predict(df[column_name].astype(str).tolist(), batch_size=256)

    print(f"\n{column_name} Column Subjectivity Counts:")
    print(df[label_col].value_counts())
    return df

# Label the 'journalistic' column first, then the 'conversational' one
for text_column in ('journalistic', 'conversational'):
    df = process_column(df, text_column)

# Describe how the subjectivity label moves from the journalistic to the conversational text
def get_subjectivity_change_type(row):
    """
    Name the subjectivity transition recorded in one row.

    Args:
        row: Mapping-like row with the keys 'journalistic_Subjectivity'
            and 'conversational_Subjectivity'.

    Returns:
        str: 'No_Change' when both labels are equal, otherwise
        '<journalistic label>_to_<conversational label>'.
    """
    if row['journalistic_Subjectivity'] == row['conversational_Subjectivity']:
        return 'No_Change'
    return f"{row['journalistic_Subjectivity']}_to_{row['conversational_Subjectivity']}"

# Label every row with its subjectivity transition (or 'No_Change')
df['Subjectivity_Change_Type'] = df.apply(get_subjectivity_change_type, axis=1)

# Frequency of each transition
change_type_counts = df['Subjectivity_Change_Type'].value_counts()
print("\nSubjectivity Change Counts:")
print(change_type_counts)

# Reading the output: LABEL_0 = objective, LABEL_1 = subjective

tokenizer_config.json:   0%|          | 0.00/412 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/881 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Label mapping: {0: 'LABEL_0', 1: 'LABEL_1'}
Processing 'journalistic' column...


Processing batches: 100%|██████████| 21/21 [00:56<00:00,  2.69s/it]



journalistic Column Subjectivity Counts:
journalistic_Subjectivity
LABEL_0    4866
LABEL_1     486
Name: count, dtype: int64
Processing 'conversational' column...


Processing batches: 100%|██████████| 21/21 [00:41<00:00,  1.96s/it]


conversational Column Subjectivity Counts:
conversational_Subjectivity
LABEL_0    4646
LABEL_1     706
Name: count, dtype: int64

Subjectivity Change Counts:
Subjectivity_Change_Type
No_Change             4534
LABEL_0_to_LABEL_1     519
LABEL_1_to_LABEL_0     299
Name: count, dtype: int64





### Calculate formality

In [17]:
# Use the GPU when one is visible, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Formality checkpoint used in this section
model_name = 's-nlp/roberta-base-formality-ranker'

# Load its tokenizer (this rebinds the module-level `tokenizer`)
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the model (this rebinds the module-level `model`) and move it to the device
print("Loading model...")
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.to(device)
model.eval()  # Inference only: switch to evaluation mode

def batch_classify_formality(texts, batch_size=64):
    """
    Label each text as 'Formal' or 'Informal', working through the list in batches.

    Uses the module-level `tokenizer`, `model` and `device`. The tokenizer is
    called with truncation enabled and no explicit max_length.

    Args:
        texts (list of str): The texts to classify.
        batch_size (int): The number of texts to process in each batch.

    Returns:
        list of str: 'Formal' or 'Informal' for each text, in input order.
    """
    results = []

    # Stepping through the start offsets yields the same batches as ceiling division
    for start in tqdm(range(0, len(texts), batch_size), desc="Classifying Batches"):
        chunk = texts[start:start + batch_size]
        encoded = tokenizer(
            chunk,
            return_tensors='pt',
            truncation=True,
            padding=True
        ).to(device)

        with torch.no_grad():
            logits = model(**encoded).logits  # Shape: [batch_size, 2]

        # NOTE(review): column 1 is taken to be the 'Formal' class - confirm against model.config.id2label
        formal_probs = torch.nn.functional.softmax(logits, dim=-1)[:, 1].cpu().numpy()
        results.extend('Formal' if prob >= 0.5 else 'Informal' for prob in formal_probs)

    return results

# Pull both text columns out of the DataFrame as plain Python lists
journalistic_texts = df['journalistic'].tolist()
conversational_texts = df['conversational'].tolist()

# Label the 'journalistic' texts and store the result on the DataFrame
print("\nClassifying 'Journalistic' Column...")
df['Journalistic_Formality'] = batch_classify_formality(journalistic_texts, batch_size=256)

# Label the 'conversational' texts and store the result on the DataFrame
print("\nClassifying 'Conversational' Column...")
df['Conversational_Formality'] = batch_classify_formality(conversational_texts, batch_size=256)

# Frequency of 'Formal' and 'Informal' labels in each column
# (these names were previously bound to the sentiment counts and are overwritten here)
journalistic_counts = df['Journalistic_Formality'].value_counts()
conversational_counts = df['Conversational_Formality'].value_counts()

print("\nJournalistic Column Formality Counts:")
print(journalistic_counts)

print("\nConversational Column Formality Counts:")
print(conversational_counts)

def get_formality_change_type(row):
    """
    Name the formality transition recorded in one row.

    Args:
        row (pd.Series): A row holding 'Journalistic_Formality' and
            'Conversational_Formality'.

    Returns:
        str: 'Formal_to_Informal', 'Informal_to_Formal', or 'No_Change' for
        every other combination of values.
    """
    transition = (row['Journalistic_Formality'], row['Conversational_Formality'])
    if transition == ('Formal', 'Informal'):
        return 'Formal_to_Informal'
    if transition == ('Informal', 'Formal'):
        return 'Informal_to_Formal'
    return 'No_Change'

# Label every row with its formality transition (or 'No_Change')
df['Formality_Change_Type'] = df.apply(get_formality_change_type, axis=1)

# Frequency of each transition (overwrites the counts variable used by earlier sections)
change_type_counts = df['Formality_Change_Type'].value_counts()

print("\nFormality Change Counts:")
print(change_type_counts)

Using device: cuda
Loading tokenizer...


tokenizer_config.json:   0%|          | 0.00/288 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/827 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



Loading model...


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]


Classifying 'Journalistic' Column...


Classifying Batches: 100%|██████████| 21/21 [02:53<00:00,  8.25s/it]



Classifying 'Conversational' Column...


Classifying Batches: 100%|██████████| 21/21 [00:31<00:00,  1.48s/it]


Journalistic Column Formality Counts:
Journalistic_Formality
Formal      5125
Informal     227
Name: count, dtype: int64

Conversational Column Formality Counts:
Conversational_Formality
Formal      4679
Informal     673
Name: count, dtype: int64

Formality Change Counts:
Formality_Change_Type
No_Change             4658
Formal_to_Informal     570
Informal_to_Formal     124
Name: count, dtype: int64



