# In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# 1. Dataset Preparation (simulated example dataset)
# Every sample is written as a (text, category) pair so that a description
# and its label sit side by side; the pairs are then unzipped into the two
# columns of the dataset.
_samples = [
    ('This tutorial explains how to solve calculus problems', 'study'),
    ('Watch the new movie trailer for the latest action film', 'non-study'),
    ('Learn how to code in Python with this easy guide', 'study'),
    ('Here are the top 10 places to visit this summer', 'non-study'),
    ('Studying tips to improve your concentration and focus', 'study'),
    ('New gaming console released with advanced features', 'non-study'),
    ('An introduction to quantum mechanics for beginners', 'study'),
    ('Latest music video from popular artist just released', 'non-study'),
]
data = {
    'content': [text for text, _ in _samples],
    'label': [category for _, category in _samples],
}

# Tabular view of the samples: one row per piece of content
df = pd.DataFrame(data)

# Show a header line followed by the first rows of the dataset
print("Dataset:", df.head(), sep="\n")

# 2. Preprocessing and Feature Extraction using TF-IDF

# Labels (study or non-study)
y = df['label']

# Split the RAW text into training and test sets (80% train, 20% test)
# before any vectorization. Fitting TF-IDF on the full dataset would let the
# vocabulary and IDF weights absorb test-set documents (data leakage) and
# make the evaluation below optimistic.
# stratify=y keeps both classes represented in the very small test split.
text_train, text_test, y_train, y_test = train_test_split(
    df['content'], y, test_size=0.2, random_state=42, stratify=y
)

# Learn the vocabulary / IDF weights from the training text only, then apply
# the same fitted transformation to the held-out text.
vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# TF-IDF features for the whole dataset, produced with the train-fitted
# vectorizer (kept under the name `X` for any code that still refers to it).
X = vectorizer.transform(df['content'])

# 3. Training the SVM Model
model = SVC(kernel='linear')
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model on documents the vectorizer and classifier never saw
print("\nModel Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# 4. Testing with New Content
# Unseen descriptions to run through the fitted vectorizer and trained model.
new_content = [
    "How to prepare for your math exam effectively",
    "Watch the new comedy show now streaming",
    "Python programming course for beginners",
    "Latest fashion trends for 2024",
]

# Reuse the already-fitted vectorizer (transform only, no re-fitting) so the
# new texts are projected onto the same feature space the model was trained on.
new_content_tfidf = vectorizer.transform(new_content)

# One predicted label (study or non-study) per input text, in input order
predictions = model.predict(new_content_tfidf)

print("\nNew Content Classification:")
for content, label in zip(new_content, predictions):
    message = f'Content: "{content}" is classified as: {label}'
    print(message)


# In [None]:
import joblib

# Persist the fitted artifacts to the current working directory so they can
# be reloaded later without retraining. Both are needed together: the
# vectorizer turns raw text into the features the model expects.

# Trained SVM classifier
joblib.dump(value=model, filename='content_filter_model.pkl')

# Fitted TF-IDF vectorizer
joblib.dump(value=vectorizer, filename='vectorizer.pkl')
