<a href="https://colab.research.google.com/github/manikanta741/Data-Science/blob/main/data%20modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Data Modeling
# Data modeling applies statistical models or machine learning algorithms to
# extract insights and make predictions.
# The cells below are a step-by-step guide to building, training, and
# evaluating predictive models.
#
# NOTE: the triple-quoted text that follows is a bare string expression, not a
# comment. As the last statement in the cell, its value is echoed as the cell
# output (a tab-separated overview of problem types and example models).
"""Applying Statistical Models & Machine Learning Algorithms
Different models can be used based on the type of problem:

Problem Type	Example Models
Regression (Predicting continuous values)	Linear Regression, Decision Trees, Random Forest, XGBoost
Classification (Predicting categories)	Logistic Regression, Random Forest, SVM, Neural Networks
Clustering (Grouping similar data)	K-Means, DBSCAN, Hierarchical Clustering"""

In [None]:
# Linear Regression -- for continuous targets
# Suited to numeric outcomes such as house prices, sales, or temperatures.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Predictors and numeric target.
# NOTE(review): `df` must already exist when this cell runs; it is not created
# in the visible part of the notebook.
X = df[['Experience', 'Age', 'Education_Level']]
y = df['Salary']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# fit() returns the estimator itself, so construction and training are chained.
model = LinearRegression().fit(X_train, y_train)

# Predicted salaries for the held-out rows.
y_pred = model.predict(X_test)


In [None]:
# Logistic Regression (for classification problems)
# Used when the target variable is categorical (e.g., spam detection, churn prediction).
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# The split left in scope by the linear-regression cell has the continuous
# 'Salary' column as its target; LogisticRegression needs discrete class labels
# and raises a ValueError on a continuous target. Build a classification split
# here instead, using the same features and label as the decision-tree cell.
X = df[['Age', 'Salary']]
y = df['Purchased']  # binary class label
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the classifier and predict class labels for the held-out rows.
# NOTE(review): the features are unscaled; if the solver emits a
# ConvergenceWarning, standardize them or raise max_iter.
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)


In [None]:
# Decision Tree classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Two predictors and the binary (Yes/No) purchase label.
X = df[['Age', 'Salary']]
y = df['Purchased']

# 80/20 train/test partition with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Gini-impurity tree with depth capped at 3; fit() hands back the fitted estimator.
clf = DecisionTreeClassifier(
    max_depth=3, criterion='gini', random_state=42
).fit(X_train, y_train)

# Class predictions for the held-out rows.
y_pred = clf.predict(X_test)

# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


In [None]:
# Support Vector Regression (SVR)
# Uses Support Vector Machines (SVM) for regression.
# Works well for high-dimensional data.
# (The two description lines above must be comments: as bare text they made the
# whole cell a SyntaxError.)
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

# The decision-tree cell leaves a classification split (target 'Purchased') in
# X_train / y_train. SVR is a regressor, so re-create the regression split with
# the continuous 'Salary' target, as in the linear-regression cell.
X = df[['Experience', 'Age', 'Education_Level']]
y = df['Salary']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# RBF-kernel support vector regressor, trained and then applied to the held-out rows.
svr = SVR(kernel='rbf')
svr.fit(X_train, y_train)
y_pred = svr.predict(X_test)




In [None]:
# Random Forest -- usable for both regression and classification
# A powerful ensemble model that reduces overfitting.
from sklearn.ensemble import RandomForestRegressor

# NOTE(review): this trains on whichever X_train / y_train are currently in
# scope -- confirm they hold a regression split (continuous target) before running.
# 100 trees with a fixed seed; fit() returns the fitted estimator.
rf = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred = rf.predict(X_test)

In [None]:
# The whole outline lives inside a single string literal so the cell is valid
# Python; as the only expression in the cell it is echoed as the cell output.
"""Training Predictive Models

Key steps in training models:

1. Split the dataset (Training: 80%, Testing: 20%).
2. Fit the model using the training data.
3. Tune hyperparameters (GridSearchCV, RandomizedSearchCV).
4. Make predictions on new data.
"""

In [None]:
"""Evaluating Model Performance
Once the model is trained, we need to check how well it performs.

For Regression Models:
Metric	Description
RMSE (Root Mean Squared Error)	Measures average prediction error. Lower is better.
R² (R-squared)	Measures how well features explain the target variable. Closer to 1 is better."""

import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

# Scores whichever y_test / y_pred pair is currently in scope.
# NOTE(review): confirm these come from a regression model before reading the numbers.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # square root of the mean squared error
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse}")
print(f"R² Score: {r2}")


In [None]:
# Evaluation metrics for classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)

# The regression cells that run before this one overwrite y_pred with
# continuous predictions, and y_pred_proba is not defined anywhere in the
# visible notebook. Rebuild the classification hold-out set (same columns and
# seed as the decision-tree cell) and score the fitted classifier `clf` on it.
X = df[['Age', 'Salary']]
y = df['Purchased']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_pred = clf.predict(X_test)
# Column 1 of predict_proba is the probability of clf.classes_[1], the class
# roc_auc_score treats as positive for a binary target.
y_pred_proba = clf.predict_proba(X_test)[:, 1]

# Accuracy: share of held-out rows classified correctly.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Confusion matrix: rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# NOTE(review): precision/recall/F1 use pos_label=1 by default, which assumes
# 'Purchased' is encoded as 0/1. If it holds strings such as 'Yes'/'No', pass
# pos_label='Yes' to each of the three calls below.

# Precision: of the rows predicted positive, how many are truly positive.
precision = precision_score(y_test, y_pred)
print(f"Precision: {precision}")

# Recall (sensitivity / true positive rate): of the truly positive rows, how many were found.
recall = recall_score(y_test, y_pred)
print(f"Recall: {recall}")

# F1-score: harmonic mean of precision and recall.
f1 = f1_score(y_test, y_pred)
print(f"F1-Score: {f1}")

# ROC AUC is computed from predicted probabilities, not hard class labels.
auc = roc_auc_score(y_test, y_pred_proba)
print(f"AUC Score: {auc}")

# Classification report (all per-class metrics together).
print(classification_report(y_test, y_pred))
