In [4]:
import pandas as pd
import numpy as np

# Sampling configuration, kept in one place so the per-class size, the
# shuffle size and the seed cannot drift apart.
SAMPLES_PER_CLASS = 10000
RANDOM_SEED = 42

# 1. Load the big data (Make sure train.csv is in your sidebar!)
df = pd.read_csv('train.csv')

# 2. Create a balanced sample: SAMPLES_PER_CLASS rows with is_duplicate == 0
#    followed by SAMPLES_PER_CLASS rows with is_duplicate == 1.
#    Sampling is without replacement, so DataFrame.sample raises ValueError
#    if either class has fewer rows than SAMPLES_PER_CLASS.
new_df = pd.concat([
    df[df['is_duplicate'] == label].sample(SAMPLES_PER_CLASS, random_state=RANDOM_SEED)
    for label in (0, 1)
])

# 3. Shuffle and save.
#    frac=1 shuffles every row that was sampled above, so there is no second
#    hard-coded row count to keep in sync with SAMPLES_PER_CLASS.
new_df = new_df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)
new_df.to_csv('sampled_data.csv', index=False)

print("Success! 'sampled_data.csv' created.")

Success! 'sampled_data.csv' created.


In [7]:
import pandas as pd

# 1. Load the data you already have
df = pd.read_csv('final_featured_data.csv')

# 2. Add the 4 missing columns (character length and word count per question).
#    read_csv loads an empty question as NaN, and astype(str) on its own would
#    turn that into the text 'nan' (length 3, one word). Blank the missing
#    values first so a missing question counts as length 0 / 0 words.
q1_text = df['question1'].fillna('').astype(str)
q2_text = df['question2'].fillna('').astype(str)

df['len_q1'] = q1_text.str.len()
df['len_q2'] = q2_text.str.len()
# str.split() with no separator splits on runs of whitespace, like Python's
# str.split(); the following .str.len() counts the resulting tokens.
df['words_q1'] = q1_text.str.split().str.len()
df['words_q2'] = q2_text.str.split().str.len()

# 3. Save it again (overwrites the file that was loaded in step 1)
df.to_csv('final_featured_data.csv', index=False)

print("Success! Missing columns added.")
print(df.columns)

Success! Missing columns added.
Index(['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate',
       'fuzz_score', 'fuzz_partial', 'token_set', 'token_sort', 'len_q1',
       'len_q2', 'words_q1', 'words_q2'],
      dtype='object')


In [6]:
# Sanity check: reload the feature file from disk and list the columns it
# currently contains.
df = pd.read_csv(filepath_or_buffer='final_featured_data.csv')
print(df.columns)

Index(['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate',
       'fuzz_score', 'fuzz_partial', 'token_set', 'token_sort'],
      dtype='object')


In [8]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle

# 1. Load the data that has our fuzzy scores
df = pd.read_csv('final_featured_data.csv')

# 2. Select the features (columns) the AI will look at
# These MUST be in this exact order: the model is fitted on the columns in
# the order listed here.
FEATURE_COLUMNS = [
    'len_q1', 'len_q2', 'words_q1', 'words_q2',
    'fuzz_score', 'fuzz_partial', 'token_set', 'token_sort',
]
X = df[FEATURE_COLUMNS]
y = df['is_duplicate']

# 3. Split data: 80% to learn from, 20% to test its knowledge
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training the Random Forest model... please wait a few seconds.")

# 4. Train the model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# 5. Check how smart it is (accuracy on the held-out 20%)
y_pred = rf.predict(X_test)
print(f"Model Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")

# 6. SAVE THE BRAIN (model.pkl)
# The with-block closes the file even if pickle.dump raises, so the handle is
# never leaked and the pickle is fully flushed before the success message.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(rf, model_file)
print("Model saved as 'model.pkl'!")

Training the Random Forest model... please wait a few seconds.
Model Accuracy: 71.92%
Model saved as 'model.pkl'!
