In [1]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning - scikit-learn
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score

# For recommendation systems specifically
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD  # for matrix factorization

# If using collaborative filtering
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

# Persisting computed artifacts to disk
import joblib

# Warnings control: silence every warning for the rest of the session
import warnings
warnings.simplefilter('ignore')

# Display settings: show every column, cap printed rows at 100
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)
# IPython magic: display matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
# Rebuild the content-based recommender artifacts from the raw movie CSV:
# clean the data, write the cleaned CSV, vectorize the overviews with TF-IDF,
# compute pairwise cosine similarity, and persist the matrices with joblib.
# Relies on pd, TfidfVectorizer, cosine_similarity and joblib from the imports cell.

# 1. Load and clean data
df = pd.read_csv('movies_data.csv')
# Replace missing overviews with empty strings so every row has text to vectorize.
df['overview'] = df['overview'].fillna('')
# Keep the first row per title, then renumber rows 0..n-1. Without the reset the
# index keeps gaps from the dropped rows, so a label taken from `df.index` would
# not match the positional row/column of the matrices built below.
df = df.drop_duplicates(subset='title', keep='first').reset_index(drop=True)

# 2. Save cleaned CSV with proper format
# Pipe-separated and without the index column; read it back with sep='|'.
df.to_csv('movies_clean.csv', index=False, sep='|')

# 3. Regenerate TF-IDF
# English stop words removed; vocabulary capped at 5000 terms.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf.fit_transform(df['overview'])

# 4. Regenerate similarity
# Row i / column j compares the overviews of df rows i and j (same order as df).
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# 5. Save everything
joblib.dump(cosine_sim, 'cosine_similarity.pkl')
joblib.dump(tfidf_matrix, 'tfidf_matrix.pkl')  # Optional backup

# 6. Push to git again (manual step; nothing is executed here)

['tfidf_matrix.pkl']