In [1]:
import pandas as pd
# Load the translated transcripts CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path, so the notebook cannot be
# re-run on another machine without editing it.
translated_dataset = pd.read_csv("/Users/miraslavats/Library/CloudStorage/GoogleDrive-miraslava@uni.minerva.edu/My Drive/Capstone/translated_1350_entries.csv")

In [2]:
# Preview the first rows to check which columns were loaded.
translated_dataset.head()

Unnamed: 0,Index,Title,Links,Date Published,Transcriptions,translated_Transcriptions
0,0,1 сентября _ Готовность к учебному году _ Ремо...,https://rutube.ru/video/6ecd6c4bc6753e4ba8d8f3...,2024-08-29T22:30:07,"Кто хотела украсть 1 сентября? Представьте, э...","Who wanted to steal September 1st? Imagine, th..."
1,1,Белорус Игорь Бокий завоевал первую медаль на ...,https://rutube.ru/video/12b0d54c8ebdffe9226b82...,2024-08-29T22:24:20,Победные новости только что пришли из Парижа....,Winning news has just come from Paris. The mai...
2,6,В Польшу с украинской стороны залетел беспилот...,https://rutube.ru/video/b364060bfc71d6bc19a372...,2024-08-29T18:22:35,"Польше находятся в состоянии гибридной войны,...","Poland is in a state of hybrid war, this was o..."
3,10,Беларусь изучают как будущий театр военных дей...,https://rutube.ru/video/cfa70cd9af6b25beddef83...,2024-08-28T21:56:19,"Вопереди спозиция, это Гагарин поехали. Балти...","Ahead lies the army exercise, it’s time for Ga..."
4,20,Жители Польши в шоке! Представители партии PIS...,https://rutube.ru/video/877d4f511b1cc4da1417ea...,2024-08-25T21:53:02,ВОЗГЛАЯ МУЗЫКА Вот уже несколько дней прибыва...,"MUSIC OF THE CALL\n\nFor several days now, I h..."


## Data Cleaning

In [3]:
import string
# Normalise the English translations: lower-case the text, then delete every
# character listed in string.punctuation.
# The deletion table is built once and applied through the vectorised `.str`
# accessor. A per-row `lambda x: x.translate(...)` rebuilds the table for every
# row and raises AttributeError on a missing value (NaN is a float); the `.str`
# methods pass missing values through unchanged.
_punctuation_table = str.maketrans('', '', string.punctuation)
translated_dataset['translated_Transcriptions'] = (
    translated_dataset['translated_Transcriptions']
    .str.lower()
    .str.translate(_punctuation_table)
)

In [4]:
import re

def remove_gibberish(text):
    """Delete every character that is neither a word character nor whitespace.

    In a Python ``str`` pattern ``\\w`` is Unicode-aware, so letters and digits
    of any script (including Cyrillic) and the underscore are kept; only
    punctuation and symbols are removed. Words are never dropped as a whole.

    A single substitution is sufficient: an earlier extra pass that removed
    symbol runs sitting between two word characters deleted a subset of the
    characters this pattern already deletes, so it had no effect on the result.

    Args:
        text (str): Text to clean.

    Returns:
        str: ``text`` with all non-word, non-whitespace characters removed.
    """
    return re.sub(r'[^\w\s]', '', text)

# Run remove_gibberish on every translated transcript and store the result back
# in the same column.
translated_dataset['translated_Transcriptions'] = (
    translated_dataset['translated_Transcriptions'].map(remove_gibberish)
)

In [5]:
def remove_non_latin(text):
    """Drop every non-ASCII character from ``text``.

    Despite the name, the filter is "ASCII only", not "Latin letters only":
    ASCII letters, digits, punctuation and whitespace are all kept, while any
    character outside the range 0x00-0x7F (accented Latin letters included)
    is removed.

    Args:
        text (str): Text to filter.

    Returns:
        str: ``text`` with all non-ASCII characters removed.
    """
    non_ascii_run = re.compile(r'[^\x00-\x7F]+')
    return non_ascii_run.sub('', text)

# Strip non-ASCII characters from every translated transcript, writing the
# result back to the same column.
translated_dataset['translated_Transcriptions'] = (
    translated_dataset['translated_Transcriptions'].map(remove_non_latin)
)

In [6]:
# Preview the frame again to see the effect of the cleaning steps on translated_Transcriptions.
translated_dataset.head()

Unnamed: 0,Index,Title,Links,Date Published,Transcriptions,translated_Transcriptions
0,0,1 сентября _ Готовность к учебному году _ Ремо...,https://rutube.ru/video/6ecd6c4bc6753e4ba8d8f3...,2024-08-29T22:30:07,"Кто хотела украсть 1 сентября? Представьте, э...",who wanted to steal september 1st imagine this...
1,1,Белорус Игорь Бокий завоевал первую медаль на ...,https://rutube.ru/video/12b0d54c8ebdffe9226b82...,2024-08-29T22:24:20,Победные новости только что пришли из Парижа....,winning news has just come from paris the main...
2,6,В Польшу с украинской стороны залетел беспилот...,https://rutube.ru/video/b364060bfc71d6bc19a372...,2024-08-29T18:22:35,"Польше находятся в состоянии гибридной войны,...",poland is in a state of hybrid war this was op...
3,10,Беларусь изучают как будущий театр военных дей...,https://rutube.ru/video/cfa70cd9af6b25beddef83...,2024-08-28T21:56:19,"Вопереди спозиция, это Гагарин поехали. Балти...",ahead lies the army exercise its time for gaga...
4,20,Жители Польши в шоке! Представители партии PIS...,https://rutube.ru/video/877d4f511b1cc4da1417ea...,2024-08-25T21:53:02,ВОЗГЛАЯ МУЗЫКА Вот уже несколько дней прибыва...,music of the call\n\nfor several days now i ha...


In [9]:
# `translated_subset` is not defined in any visible cell of this notebook, so the
# original line raised NameError on a fresh kernel. Take the 20-row trial slice
# from the loaded frame instead.
# NOTE(review): confirm translated_dataset is the frame the trial was meant to sample.
trial = translated_dataset[:20]

In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Seed PyTorch's random number generator.
# NOTE(review): the scoring function below does no sampling as far as this cell
# shows, so the seed is presumably precautionary.
torch.manual_seed(42)

# Checkpoint name passed to both from_pretrained calls below, so the tokenizer
# and the model always come from the same checkpoint.
model_name = "gpt2"

# Tokenizer for the checkpoint: converts text into the token IDs the model consumes.
# It is read as a global by calculate_perplexity_first_1024.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Pre-trained causal language model for the checkpoint; also read as a global by
# calculate_perplexity_first_1024.
model = AutoModelForCausalLM.from_pretrained(model_name)

def calculate_perplexity_first_1024(text):
    """
    Calculate the perplexity of the given text using at most its first 1024 tokens.

    Perplexity is the exponential of the model's cross-entropy loss on the text;
    lower values mean the language model finds the text more predictable.

    Relies on the module-level ``tokenizer`` and ``model`` objects.

    Args:
        text (str): Input text for which perplexity is calculated.

    Returns:
        float: The perplexity of the (possibly truncated) input text.

    Raises:
        ValueError: If ``text`` is not a string, or is empty / whitespace only.

    Note:
        - The input is truncated to 1024 tokens (``max_length=1024``); anything
          beyond that does not influence the score.
        - NOTE(review): an input that tokenizes to a single token leaves no
          next-token target, so the result is presumably NaN — confirm.
    """
    # Validate before tokenizing: rejecting bad input up front avoids tokenizer
    # work and gives missing values (e.g. NaN floats coming out of pandas) a
    # clear error instead of a failure from inside the tokenizer.
    if not isinstance(text, str):
        raise ValueError(f"Input text must be a string, got {type(text).__name__}.")
    if not text.strip():
        raise ValueError("Input text cannot be empty or whitespace.")

    # Tokenize, truncating to 1024 tokens, and keep only the token-id tensor.
    tokens = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)["input_ids"]

    # Inference only: no gradients are needed.
    with torch.no_grad():
        # Passing the input ids as labels makes the model return its
        # cross-entropy loss for the sequence directly.
        outputs = model(input_ids=tokens, labels=tokens)
        loss = outputs.loss

        # Perplexity is exp(cross-entropy loss).
        perplexity = torch.exp(loss)

    # Return a plain Python float rather than a tensor.
    return perplexity.item()

# Score every cleaned transcript and keep the result in a new 'perplexity' column.
# Each row is one forward pass through the model.
translated_dataset['perplexity'] = (
    translated_dataset['translated_Transcriptions'].map(calculate_perplexity_first_1024)
)

In [8]:
# Smoke-test inputs for calculate_perplexity_first_1024: ordinary text, over-long
# text, the empty string (expected to raise), low-probability letter groups, and
# highly repetitive text.
test_cases = [
    "This is a simple test sentence.",
    "a" * 2000,  # Long text, gets truncated
    "",  # Empty string
    "abcd efgh ijkl mno pqrs tuv wxyz",  # Random characters
    "The quick brown fox jumps over the lazy dog." * 50,  # Repetitive text
]

# Run each case and print either its perplexity or the error it raised.
for idx, text in enumerate(test_cases):
    print(f"Test Case {idx + 1}:")
    try:
        result = calculate_perplexity_first_1024(text)
        print(f"Perplexity: {result}\n")
    # Errors are caught and printed on purpose so that the empty-string case is
    # shown as a result instead of stopping the loop.
    except Exception as e:
        print(f"Error: {e}\n")

Test Case 1:
Perplexity: 65.2133560180664

Test Case 2:
Perplexity: 1.0314496755599976

Test Case 3:
Error: Input text cannot be empty or whitespace.

Test Case 4:
Perplexity: 385.2892150878906

Test Case 5:
Perplexity: 1.1323281526565552



In [5]:
import time

# Time one perplexity call on an input repeated 1000 times, i.e. long enough to
# be cut down by the 1024-token truncation.
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() follows the wall clock and can jump if the
# system clock is adjusted during the measurement.
text = "This is a test sentence." * 1000
start_time = time.perf_counter()
perplexity = calculate_perplexity_first_1024(text)
end_time = time.perf_counter()
print(f"Execution time: {end_time - start_time:.4f} seconds")

Execution time: 0.8304 seconds


In [8]:
# Inspect the last ten rows, which now include the perplexity column.
translated_dataset.tail(10)

Unnamed: 0,Index,Title,Links,Date Published,Transcriptions,translated_Transcriptions,perplexity
1344,1725,Итоги встречи Лукашенко и Путина в Москве. Как...,https://rutube.ru/video/ba873ccd26bbe58c202928...,2023-04-09T21:23:33,"Итак, все неделю график нашего президента был...",so all week our presidents schedule was dedica...,62.896767
1345,1732,К разделу Украины готовы все западные соседи —...,https://rutube.ru/video/94675b48575cc0e78c973d...,2024-09-20T11:03:54,Словом будущей Украины предрешено миллиардные...,the future of ukraine is predetermined by bill...,66.977562
1346,1733,Подготовка региональной группировки войск Бела...,https://rutube.ru/video/2a99387dca3740e42d890d...,2023-04-06T21:35:22,"Не только экономика, вопрос о безопасности со...",not only the economy but also the issue of sec...,77.948441
1347,1734,Переговоры Лукашенко и Путина. Заседание Высше...,https://rutube.ru/video/97e0549cb9ad47b36b1d86...,2023-04-06T21:25:31,Заседание высшего госоветца и узного государс...,the meeting of the highest state council and t...,70.815437
1348,1735,По следам послания Президента - безработица в ...,https://rutube.ru/video/1c109a356ae470ac3946bf...,2023-04-05T22:46:37,Добро вечер. Это экономическая среда на белор...,good evening this is the economic environment ...,64.458542
1349,1736,Под видом борьбы за свободу — радикальные идеи...,https://rutube.ru/video/0e0804d6a9921170880611...,2023-04-05T21:56:18,Важное место в борьбе с экстремизмом и террор...,an important role in the fight against extremi...,76.320084
1350,1737,Лукашенко на переговорах с Путиным - мы все пр...,https://rutube.ru/video/4f957ac2064fd669691059...,2024-09-20T09:54:53,Начало рабочего визита Александра Лукашенко в...,the beginning of alexander lukashenkos working...,66.183456
1351,1740,"Лукашенко рассказал, как вместе работают разве...",https://rutube.ru/video/86a9ba759090de773bcd86...,2024-09-24T02:52:44,Угрозы для союзного государства и не только о...,threats to the union state and beyond were dis...,81.858147
1352,1742,"Мир в Украине, Хельсинки-2, Сербия - Западу ну...",https://rutube.ru/video/71498d59f79da0561426f8...,2023-04-03T22:01:38,Если набрать в интернете мирные инициативы Бе...,if you enter peaceful initiatives of belarus i...,84.036766
1353,1744,Регулярное авиасообщение между Минском и регио...,https://rutube.ru/video/2dab3850ed55c5620b998c...,2023-04-03T21:44:13,Беларуси и колорусской области расширяют сотр...,belarus and the kaluga region are expanding th...,65.831787


In [9]:
# Write the frame, including the perplexity column, to disk; index=False leaves
# the row index out of the file.
# NOTE(review): the target name has no ".csv" extension and the path is absolute
# and machine-specific — confirm that is what downstream readers expect.
translated_dataset.to_csv("/Users/miraslavats/Library/CloudStorage/GoogleDrive-miraslava@uni.minerva.edu/My Drive/Capstone/translated_perplexity_all_1300", index = False)

In [None]:
# Display the first few rows with perplexity.
# `data` is not defined before this cell (and is later bound to a plain dict),
# so the columns are selected from translated_dataset, the frame the perplexity
# column was added to. Ending the cell on the expression gives the rich table
# rendering instead of print()'s plain-text dump.
translated_dataset[['translated_Transcriptions', 'perplexity']].head()

## Average Perplexity

In [13]:
import pandas as pd

# Parse 'Date Published' into datetimes (a no-op if the column already is one).
translated_dataset['Date Published'] = pd.to_datetime(translated_dataset['Date Published'])

# Label each row with its "Month Year" bucket, e.g. "August 2024".
translated_dataset['Bucket'] = translated_dataset['Date Published'].dt.strftime('%B %Y')

# Average perplexity per bucket, listed chronologically.
# A plain groupby('Bucket') sorts the labels as strings, which puts the months in
# alphabetical order ("April 2023", "April 2024", "August 2023", ...). Sorting
# the rows by date first and grouping with sort=False keeps the buckets in order
# of first appearance, i.e. in calendar order.
bucket_avg_perplexity = (
    translated_dataset
    .sort_values('Date Published')
    .groupby('Bucket', sort=False)['perplexity']
    .mean()
    .reset_index()
)

# Rename columns for clarity
bucket_avg_perplexity.columns = ['Bucket', 'Average Perplexity']

In [16]:
# Write the per-bucket average perplexity table to disk without the row index.
# NOTE(review): the target name has no ".csv" extension — confirm that is intended.
bucket_avg_perplexity.to_csv("/Users/miraslavats/Library/CloudStorage/GoogleDrive-miraslava@uni.minerva.edu/My Drive/Capstone/average_perplexity_per_group", index = False)

## Readability scores

These scores must be computed on the "uncleaned" transcriptions. Most of them depend on sentence length (the number of words per sentence), which cannot be measured accurately in the "cleaned" data because all punctuation, including the sentence boundaries, was removed.

In [23]:
# Reload the original file so the readability scores are computed on the
# unmodified translated_Transcriptions text. This rebinds translated_dataset, so
# columns added earlier in the notebook (e.g. perplexity) are not on the new frame.
translated_dataset = pd.read_csv("/Users/miraslavats/Library/CloudStorage/GoogleDrive-miraslava@uni.minerva.edu/My Drive/Capstone/translated_1350_entries.csv")

In [24]:
import pandas as pd
import textstat

# Sample DataFrame
# NOTE(review): the readability code in this cell runs on translated_dataset, not
# on this `df`; the sample looks like leftover example scaffolding — confirm
# before removing.
data = {
    "News Transcripts": [
        "The government has announced new policies to address the economic downturn.",
        "In a surprising turn of events, the opposition party won the election by a narrow margin.",
        "New scientific studies reveal groundbreaking insights into climate change."
    ]
}
df = pd.DataFrame(data)

def calculate_readability_scores(text):
    """Compute three textstat readability metrics for one text.

    Args:
        text (str): The text to score.

    Returns:
        dict: The keys "Flesch Reading Ease", "Flesch-Kincaid Grade Level" and
        "Gunning Fog Index", in that order, each mapped to the value returned
        by the corresponding textstat function.
    """
    # (label, scoring function) pairs; the order fixes the key order of the result.
    metrics = (
        ("Flesch Reading Ease", textstat.flesch_reading_ease),
        ("Flesch-Kincaid Grade Level", textstat.flesch_kincaid_grade),
        ("Gunning Fog Index", textstat.gunning_fog),
    )
    return {label: score(text) for label, score in metrics}

# Score every transcript; each element of the result is the dict returned by
# calculate_readability_scores.
_score_dicts = translated_dataset["translated_Transcriptions"].apply(calculate_readability_scores)
translated_dataset["Readability Scores"] = _score_dicts

# Expand the dicts into one column per metric and attach them to the original
# columns (the dict-valued helper column is left out of the combined frame).
_score_columns = _score_dicts.apply(pd.Series)
_original_columns = translated_dataset.drop(columns=["Readability Scores"])
bel_media_scores = pd.concat([_original_columns, _score_columns], axis=1)

In [29]:
# Show the full (untruncated) link of the first row.
bel_media_scores['Links'].iloc[0]

'https://rutube.ru/video/6ecd6c4bc6753e4ba8d8f30e94df249b/'

### Average Readability Scores

In [26]:
# Parse 'Date Published' into datetimes so the .dt accessor can be used.
bel_media_scores['Date Published'] = pd.to_datetime(bel_media_scores['Date Published'])

# Label each row with its "Month Year" bucket, e.g. "August 2024".
bel_media_scores['Bucket'] = bel_media_scores['Date Published'].dt.strftime('%B %Y')

# Average each readability metric per bucket, listed chronologically.
# A plain groupby('Bucket') sorts the labels as strings, which puts the months in
# alphabetical order. Sorting the rows by date first and grouping with sort=False
# keeps the buckets in order of first appearance, i.e. in calendar order.
bucket_avg_readability = (
    bel_media_scores
    .sort_values('Date Published')
    .groupby('Bucket', sort=False)[
        ['Flesch Reading Ease', 'Flesch-Kincaid Grade Level', 'Gunning Fog Index']
    ]
    .mean()
    .reset_index()
)

# Rename columns to make clear that the values are per-bucket averages.
bucket_avg_readability.columns = ['Bucket', 'Avg Flesch Reading Ease', 'Avg Flesch-Kincaid Grade Level', 'Avg Gunning Fog Index']

In [28]:
# Write the per-bucket average readability table to disk without the row index.
# NOTE(review): the target name has no ".csv" extension — confirm that is intended.
bucket_avg_readability.to_csv("/Users/miraslavats/Library/CloudStorage/GoogleDrive-miraslava@uni.minerva.edu/My Drive/Capstone/bel_average_readability_per_group", index = False)