Load the dataset from a CSV file.

In [None]:
import pandas as pd

# Relative path: resolved against the directory the notebook/script runs from.
DATA_PATH = 'data.csv'
# Read the raw dataset into the DataFrame the following cells operate on.
df = pd.read_csv(DATA_PATH)

Explore the dataset with summary statistics.

In [None]:
# Summary statistics table produced by `DataFrame.describe` with default settings.
summary_stats = df.describe()
# Left as the final bare expression so a notebook cell displays the table.
summary_stats

Remove duplicate entries from the dataset.

In [None]:
# Keep the first occurrence of each fully identical row and discard the repeats;
# the surviving rows keep their original index labels.
deduplicated = df.drop_duplicates(keep='first')
df = deduplicated

Encode categorical features 'cut', 'clarity', and 'color' to numerical values.

In [None]:
# Replace each categorical column with its integer category codes.
# NOTE: for plain string columns, `astype('category')` numbers the categories in
# sorted label order, and `cat.codes` uses -1 for missing values, so the integers
# are arbitrary labels rather than an ordinal ranking.
for column in ('cut', 'clarity', 'color'):
    df[column] = df[column].astype('category').cat.codes

Visualize data relationships using pair plots.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Build the grid of pairwise plots from `df`, then render the figure.
pair_grid = sns.pairplot(data=df)
plt.show()

Generate a correlation heatmap to identify feature relationships.

In [None]:
# Pairwise correlation between the columns of `df`.
correlation_matrix = df.corr()
# annot=True writes each coefficient inside its heatmap cell.
heatmap_axes = sns.heatmap(data=correlation_matrix, annot=True)
plt.show()

Prepare data by splitting it into training and testing sets.

In [None]:
from sklearn.model_selection import train_test_split

# Separate the label column from the predictor columns.
target_column = 'target'
X = df.drop(columns=target_column)
y = df[target_column]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

Scale features for better model performance.

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only, then apply that same
# fitted transform to both splits so the test rows do not influence the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Explore various models; here a Random Forest Classifier is used.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so repeated runs build the same trees. The train/test split is
# already seeded with 42; an unseeded forest would still make the reported
# accuracy change from run to run.
model = RandomForestClassifier(random_state=42)
# Kept as the final expression so a notebook cell displays the fitted estimator.
model.fit(X_train_scaled, y_train)

Check model performance by evaluating accuracy.

In [None]:
from sklearn.metrics import accuracy_score

# Predict labels for the scaled test features and score them against the true labels.
predictions = model.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)

Save the trained model to disk with joblib.

In [None]:
import joblib

# Serialize the fitted estimator to disk.
# NOTE(review): only `model` is written; the fitted `scaler` that produced its
# training features is not saved here. Confirm it is persisted elsewhere before
# this file is loaded for inference.
MODEL_PATH = 'final_model.pkl'
joblib.dump(model, MODEL_PATH)

Validate the model with test data and print results.

In [None]:
# Score the stored test-set predictions against the true labels and report the result.
validation_accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print('Validation Accuracy:', validation_accuracy)

Visualize the distribution of the target column with a count plot.

In [None]:
import matplotlib.pyplot as plt

# Bar chart of how many rows of `df` fall into each value of the 'target' column.
target_counts_axes = sns.countplot(data=df, x='target')
plt.show()