In [1]:
import pandas as pd
from scipy.stats import chi2_contingency

# Load the data
df = pd.read_csv('data.csv').drop_duplicates(subset=['id'])
df = df[~df['tags'].str.contains('shorts', case=False, na=False)]

# Step 1: Filter for machining videos
# Adjust the keywords based on your understanding of the dataset
machining_keywords = ['machining', 'lathe', 'milling','maker', 'CNC']
df['is_machining'] = df['tags'].fillna('').apply(lambda x: any(keyword in x for keyword in machining_keywords))

# Step 2: Analyze the proportion of liked machining videos
machining_videos = df[df['is_machining']]
liked_machining_videos = machining_videos[machining_videos['liked'] == 1]

# Calculate proportions
total_machining_videos = len(machining_videos)
liked_machining_videos_count = len(liked_machining_videos)
proportion = liked_machining_videos_count / total_machining_videos
print(f"Proportion of liked machining videos: {proportion}")

# Step 3: Statistical Testing
# Create a contingency table
contingency_table = [
    [liked_machining_videos_count, total_machining_videos - liked_machining_videos_count],
    [df['liked'].sum() - liked_machining_videos_count, len(df) - total_machining_videos - (df['liked'].sum() - liked_machining_videos_count)]
]

chi2, p, dof, expected = chi2_contingency(contingency_table)

# Interpret the result
alpha = 0.05  # significance level
print(f"Chi-Squared Test: chi2 = {chi2}, p-value = {p}")
if p < alpha:
    print("Reject the null hypothesis - there is a significant difference in liking machining videos.")
else:
    print("Fail to reject the null hypothesis - there is no significant difference in liking machining videos.")

Proportion of liked machining videos: 0.09302325581395349
Chi-Squared Test: chi2 = 20.9919720663117, p-value = 4.61211897573134e-06
Reject the null hypothesis - there is a significant difference in liking machining videos.


There we go, I do like machining videos and it's now statistically proven.