### Imports

In [16]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

### Load the CSVs

In [7]:
# Read the two source tables into separate frames
df_main = pd.read_csv('data/ted_main.csv')
df_transcripts = pd.read_csv('data/transcripts.csv')

# Report the dimensions of each frame as a quick sanity check
main_rows, main_cols = df_main.shape
transcript_rows, transcript_cols = df_transcripts.shape
print(f"Main has {main_rows} rows and {main_cols} columns.")
print(f"Transcripts has {transcript_rows} rows and {transcript_cols} columns.")

Main has 2550 rows and 17 columns.
Transcripts has 2467 rows and 2 columns.


### Merge

In [8]:
# Both frames carry a 'url' column, so it serves as the join key.
# An inner join keeps only the rows whose url appears in both files.
df = df_main.merge(df_transcripts, on='url', how='inner')

# Confirm the row count, then preview the first two rows
print(f"The final dataset has {len(df)} rows.")
df.head(2)

The final dataset has 2467 rows.


Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,url,views,transcript
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,1140825600,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","[{'id': 865, 'hero': 'https://pe.tedcdn.com/im...",Author/educator,"['children', 'creativity', 'culture', 'dance',...",Do schools kill creativity?,https://www.ted.com/talks/ken_robinson_says_sc...,47227110,Good morning. How are you?(Laughter)It's been ...
1,265,With the same humor and humanity he exuded in ...,977,TED2006,1140825600,43,Al Gore,Al Gore: Averting the climate crisis,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...","[{'id': 243, 'hero': 'https://pe.tedcdn.com/im...",Climate advocate,"['alternative energy', 'cars', 'climate change...",Averting the climate crisis,https://www.ted.com/talks/al_gore_on_averting_...,3200520,"Thank you so much, Chris. And it's truly a gre..."


### Text Cleaning Pipeline

In [9]:
# Download the NLTK resources this notebook relies on (only needs to run once
# per environment).
#   - 'stopwords': the corpus read by stopwords.words('english') further down
#   - 'punkt': tokenizer models, presumably for word_tokenize, which is imported
#     but not called in the visible cells; TODO confirm it is still needed
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/nicolas/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /home/nicolas/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [10]:
# Build the English stop-word lookup once, as a set, so that the per-word
# membership test used when cleaning the transcripts stays cheap.
stop_words = {*stopwords.words('english')}

In [13]:
def clean_text(text):
    """Normalise raw text into lowercase, stop-word-free, space-separated words.

    Parameters
    ----------
    text : str
        Raw text to clean. Any non-string value (for example ``None`` or the
        float NaN that pandas uses for an empty CSV cell) is treated as
        empty text instead of raising ``AttributeError``.

    Returns
    -------
    str
        The lowercased text with every character other than ``a-z`` and
        whitespace replaced by a space, words found in the module-level
        ``stop_words`` collection removed, and the remaining words joined by
        single spaces. Returns ``''`` when nothing is left.

    Notes
    -----
    Digits, punctuation and non-ASCII letters are all replaced by spaces, so
    a word such as "café" becomes "caf" and "it's" becomes "it" and "s".
    """
    # Missing cells are not strings; .lower() would crash on them
    if not isinstance(text, str):
        return ''

    text = text.lower()

    # Keep only a-z and whitespace; everything else becomes a separator
    text = re.sub(r'[^a-z\s]', ' ', text)

    # Drop stop words (common words like 'the', 'is', 'in')
    return ' '.join(word for word in text.split() if word not in stop_words)

In [14]:
# Run the cleaner over every transcript and store the result in a new column
print('Cleaning transcripts... this might take a moment.')
df['clean_transcript'] = df['transcript'].apply(clean_text)
print('Done!')

# Compare the opening 50 characters of the first talk before and after cleaning
for heading, column in (
    ('\n--- Original Text (First 50 chars) ---', 'transcript'),
    ('\n--- Cleaned Text (First 50 chars) ---', 'clean_transcript'),
):
    print(heading)
    print(df[column].iloc[0][:50])

Cleaning transcripts... this might take a moment.
Done!

--- Original Text (First 50 chars) ---
Good morning. How are you?(Laughter)It's been grea

--- Cleaned Text (First 50 chars) ---
good morning laughter great blown away whole thing


### TF-IDF Vectorization and Cosine Similarity

#### Initialize the Vectorizer

In [17]:
# clean_text has already removed NLTK's English stop words. Passing
# stop_words='english' additionally applies scikit-learn's built-in English
# stop-word list at vectorization time. NOTE(review): that list is presumably
# not identical to NLTK's, so this may remove further words rather than act as
# a pure double-check; verify against the scikit-learn docs.
tfidf = TfidfVectorizer(stop_words='english')

In [18]:
# Fit the vectorizer on the cleaned transcripts and encode them in one step
print('Vectorizing...')
cleaned_corpus = df['clean_transcript']
tfidf_matrix = tfidf.fit_transform(cleaned_corpus)

# Report the matrix dimensions
matrix_shape = tfidf_matrix.shape
print(f"TF-IDF Matrix Shape: {matrix_shape}")

Vectorizing...
TF-IDF Matrix Shape: (2467, 57085)


#### Calculate Similarity

In [19]:
# Pairwise similarity between every pair of talks.
# linear_kernel is a plain dot product; it doubles as cosine similarity here
# on the assumption that the TF-IDF rows are unit-length (TfidfVectorizer's
# default normalisation; confirm if the vectorizer settings change).
print('Calculating similarity scores...')
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

print(f"Cosine Similarity Matrix Shape: {cosine_sim.shape}")

# Sanity check: a talk compared with itself should score 1.0 (up to rounding)
self_similarity = cosine_sim[0, 0]
print(f"Similarity of talk 0 with talk 0: {self_similarity}")

Calculating similarity scores...
Cosine Similarity Matrix Shape: (2467, 2467)
Similarity of talk 0 with talk 0: 0.9999999999999983


### Recommendation Function

In [20]:
# Reverse mapping from talk title to its row position in df (and therefore to
# its row in cosine_sim). Positions are generated explicitly so the mapping
# stays valid for positional lookups even if df's index labels are not 0..n-1.
#
# Series.drop_duplicates() compares *values*, and every position is unique, so
# it never removed repeated titles; a repeated title then made
# title_to_index[title] return a Series instead of a single position.
# De-duplicate on the index (the titles) instead, keeping the first occurrence.
title_positions = pd.Series(range(len(df)), index=df['title'])
title_to_index = title_positions[~title_positions.index.duplicated(keep='first')]

In [23]:
def get_recommendations(title, cosine_sim=cosine_sim, top_n=5):
    """Return the talks most similar to the talk with the given title.

    Parameters
    ----------
    title : str
        Exact title to look up in the module-level ``title_to_index`` mapping.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix; row ``i`` holds the scores of talk ``i``
        against every talk. Defaults to the module-level ``cosine_sim`` as it
        existed when this function was defined.
    top_n : int, optional
        Maximum number of recommendations to return (default 5, the value
        that used to be hard-coded). Values below 1 give an empty result.

    Returns
    -------
    pandas.DataFrame or str
        The ``title`` and ``main_speaker`` columns of the ``top_n`` most
        similar talks, best match first, or the string
        ``'Error: Title not found in dataset.'`` when the title is unknown.
    """
    if title not in title_to_index:
        return 'Error: Title not found in dataset.'

    idx = title_to_index[title]
    # A repeated title makes the lookup return a Series of positions rather
    # than a single one; fall back to the first occurrence.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Rank every *other* talk by similarity, best first. The queried talk is
    # excluded by position instead of assuming it sorts to the top: a talk with
    # an identical transcript, or float rounding (self-similarity is not
    # exactly 1.0), can place another talk ahead of it.
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )

    talk_indices = [pos for pos, _ in ranked[:max(top_n, 0)]]

    # Return the titles and speakers of the selected talks
    return df[['title', 'main_speaker']].iloc[talk_indices]

### Test

In [None]:
# Try the recommender on Ken Robinson's talk
input_talk = "Do schools kill creativity?"
header = f"Recommendations based on: '{input_talk}'"
divider = "-" * 50

print(header)
print(divider)
print(get_recommendations(input_talk))

Recommendations based on: 'Do schools kill creativity?'
--------------------------------------------------
                                                  title    main_speaker
1421             How to escape education's death valley    Ken Robinson
663                   Bring on the learning revolution!    Ken Robinson
1968  How I stopped the Taliban from shutting down m...  Sakena Yacoobi
21                                      Nerdcore comedy        Ze Frank
1832        How to run a company with (almost) no rules  Ricardo Semler
