In [None]:
!pip install --upgrade pip jupyter ipywidgets
!pip install matplotlib numpy pandas seaborn prophet scikit-learn

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from prophet import Prophet
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.preprocessing import StandardScaler

# Automating Data Cleaning

In [None]:
# Data-cleaning walkthrough on a tiny employee table: visualize the raw data,
# impute missing values, cap outliers, encode the categorical column, then
# standardize / normalize the numeric columns and inspect their correlations.

# Sample dataset with missing values, outliers, and categorical data
data = {
    'Employee': ['Alice', 'Bob', 'Charlie', 'David'],
    'Sales': [1000, 2000, np.nan, 50000],  # Outlier in David's sales
    'Experience (Years)': [3, 5, 2, np.nan],
    'Department': ['Sales', 'HR', 'IT', 'Sales'],
    'Bonus': [500, 700, 300, 10000]  # Outlier in Bonus
}
df = pd.DataFrame(data)

# Step 1: Visualize Original Data
print("\n--- Original Dataset ---")
print(df)

plt.figure(figsize=(10, 6))
sns.barplot(x='Employee', y='Sales', data=df)
plt.title("Original Sales Data")
plt.ylabel("Sales")
plt.show()

plt.figure(figsize=(10, 6))
sns.barplot(x='Employee', y='Experience (Years)', data=df)
plt.title("Original Experience Data")
plt.ylabel("Experience (Years)")
plt.show()

# Observations: Missing values and potential outliers are visible.

# Step 2: Handle Missing Values
# Both columns are imputed in one call; SimpleImputer computes the mean per
# column, so the result equals imputing each column separately.
# NOTE: imputation runs before outlier capping, so the mean used for 'Sales'
# still includes David's outlier.
print("\n--- Handling Missing Values ---")
imputed_columns = ['Experience (Years)', 'Sales']
imputer = SimpleImputer(strategy='mean')
df[imputed_columns] = imputer.fit_transform(df[imputed_columns])
print("Dataset After Imputation:")
print(df)

plt.figure(figsize=(10, 6))
sns.heatmap(df.isnull(), cbar=False, cmap="viridis")
plt.title("Missing Values After Imputation")
plt.show()

# Step 3: Handle Outliers
# Values above the 95th percentile are replaced by that percentile. The
# "Capped" flag is taken from the mask computed BEFORE capping, so a value that
# merely happens to equal the percentile is not reported as capped.
print("\n--- Handling Outliers ---")
for column in ['Sales', 'Bonus']:
    q95 = df[column].quantile(0.95)
    was_capped = df[column] > q95
    df[column] = np.where(was_capped, q95, df[column])
    df[f'{column} Capped'] = np.where(was_capped, "Yes", "No")

print("Dataset After Outlier Capping:")
print(df)

plt.figure(figsize=(10, 6))
sns.boxplot(data=df[['Sales', 'Bonus']])
plt.title("Boxplot After Outlier Handling")
plt.show()

# Step 4: Encode Categorical Data
# LabelEncoder maps each distinct department to an integer code.
print("\n--- Encoding Categorical Data ---")
label_encoder = LabelEncoder()
df['Department Encoded'] = label_encoder.fit_transform(df['Department'])
print("Encoded Departments:")
print(df[['Department', 'Department Encoded']])

plt.figure(figsize=(10, 6))
sns.barplot(x='Department', y='Department Encoded', data=df)
plt.title("Encoded Department Data")
plt.show()

# Step 5: Standardize Numerical Features
# Work on a copy so `df` keeps the cleaned values in their original units.
print("\n--- Standardizing Numerical Features ---")
scaler = StandardScaler()
numerical_features = ['Sales', 'Experience (Years)', 'Bonus']
df_standardized = df.copy()
df_standardized[numerical_features] = scaler.fit_transform(df_standardized[numerical_features])
print("Standardized Numerical Features:")
print(df_standardized[numerical_features])

plt.figure(figsize=(10, 6))
sns.boxplot(data=df_standardized[numerical_features])
plt.title("Boxplot of Standardized Features")
plt.show()

# Step 6: Normalize Numerical Features
# Min-max scaling is applied to a second copy of the cleaned frame.
print("\n--- Normalizing Numerical Features ---")
min_max_scaler = MinMaxScaler()
df_normalized = df.copy()
df_normalized[numerical_features] = min_max_scaler.fit_transform(df_normalized[numerical_features])
print("Normalized Numerical Features:")
print(df_normalized[numerical_features])

plt.figure(figsize=(10, 6))
sns.boxplot(data=df_normalized[numerical_features])
plt.title("Boxplot of Normalized Features")
plt.show()

# Step 7: Correlation Analysis for Insights
# Every numeric column of the standardized frame is included, which also
# covers the integer 'Department Encoded' column.
print("\n--- Correlation Analysis ---")
numeric_columns = df_standardized.select_dtypes(include=[np.number])
correlation_matrix = numeric_columns.corr()
print("Correlation Matrix (Standardized Data):")
print(correlation_matrix)

plt.figure(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Heatmap (Standardized Data)")
plt.show()

# Final Cleaned Dataset
print("\n--- Final Cleaned Dataset ---")
print(df)

# Employee Productivity Analysis

In [None]:
# Employee productivity analysis: standardize three performance metrics,
# group employees with KMeans, and inspect the resulting clusters.

# Expanded Example Dataset
data = {
    'Employee': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace', 'Hank', 'Ivy', 'Jack', 'Karen', 'Leo'],
    'Sales': [12000, 15000, 10000, 17000, 14000, 9000, 20000, 11000, 18000, 16000, 13000, 8000],
    'Hours Worked': [160, 170, 150, 180, 165, 145, 200, 155, 190, 175, 160, 140],
    'Customer Satisfaction': [4.2, 4.5, 3.8, 4.8, 4.3, 3.5, 4.9, 3.9, 4.7, 4.6, 4.1, 3.2]
}
df = pd.DataFrame(data)
cluster_features = ['Sales', 'Hours Worked', 'Customer Satisfaction']

print("\n--- Initial Dataset ---")
print(df)

# Step 1: Visualize Original Data
print("\n--- Visualizing Original Data Distribution ---")
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='Sales', y='Hours Worked', s=100)
plt.title("Original Data Distribution")
plt.xlabel("Sales")
plt.ylabel("Hours Worked")
plt.grid(True)
plt.show()

# Step 2: Normalize Data for Clustering
# KMeans is distance based, so the metrics are standardized first to keep the
# large-magnitude 'Sales' column from dominating the distance.
print("\n--- Normalizing Data for Clustering ---")
scaler = StandardScaler()
features = scaler.fit_transform(df[cluster_features])
print("Normalized Features (first 5 rows):")
print(pd.DataFrame(features, columns=cluster_features).head())

# Step 3: Apply KMeans Clustering
# n_init is set explicitly: its default differs between scikit-learn versions
# (and emits a FutureWarning on some), which would make results version dependent.
print("\n--- Applying KMeans Clustering ---")
n_clusters = 3
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
df['Cluster'] = kmeans.fit_predict(features)
print(f"Cluster Assignments:\n{df[['Employee', 'Cluster']]}")

# Calculate silhouette score
sil_score = silhouette_score(features, kmeans.labels_)
print(f"Silhouette Score for {n_clusters} clusters: {sil_score:.2f}")

# Add cluster centroids for visualization
# Centroids live in standardized space; map them back to the original units.
centroids = scaler.inverse_transform(kmeans.cluster_centers_)
print("\nCluster Centroids (Original Scale):")
print(pd.DataFrame(centroids, columns=cluster_features))

# Step 4: Visualize Clusters with Centroids
# Columns 0 and 1 of `centroids` are 'Sales' and 'Hours Worked' respectively.
print("\n--- Visualizing Clusters with Centroids ---")
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='Sales', y='Hours Worked', hue='Cluster', palette='viridis', s=100)
plt.scatter(centroids[:, 0], centroids[:, 1], c='red', marker='X', s=200, label='Centroids')
plt.title("Employee Clusters with Centroids")
plt.xlabel("Sales")
plt.ylabel("Hours Worked")
plt.legend(title="Cluster")
plt.grid(True)
plt.show()

# Step 5: Cluster Analysis - Boxplots
# One boxplot per metric. `hue` is set to the x variable because passing
# `palette` without `hue` is deprecated in seaborn 0.13; `dodge=False` keeps
# the boxes centered and the redundant legend is removed if one was drawn.
print("\n--- Analyzing Clusters with Boxplots ---")
for metric in cluster_features:
    plt.figure(figsize=(10, 6))
    ax = sns.boxplot(data=df, x='Cluster', y=metric, hue='Cluster', palette='viridis', dodge=False)
    if ax.get_legend() is not None:
        ax.get_legend().remove()
    plt.title(f"{metric} Distribution by Cluster")
    plt.xlabel("Cluster")
    plt.ylabel(metric)
    plt.show()

# Step 6: Add Cluster Summary (Exclude Non-Numeric Columns)
print("\n--- Adding Cluster Summary ---")
summary = df.drop(columns=['Employee']).groupby('Cluster').mean().reset_index()
print(summary)

# Step 7: Pairplot for Cluster Analysis
print("\n--- Creating Pairplot for Clusters ---")
sns.pairplot(df, vars=cluster_features, hue='Cluster', palette='viridis', corner=True,
             diag_kind='kde')
plt.suptitle("Pairplot of Clusters", y=1.02)
plt.show()


# Forecasting Sales or Demand

In [None]:
# Sales forecasting demo: build one year of synthetic monthly sales
# (trend + seasonality + noise), fit Prophet, and forecast six more months.

# Generate a more realistic dataset with seasonality, trend, and random noise
np.random.seed(42)  # For reproducibility
# 'MS' (month start) is used instead of 'M': the 'M' alias is deprecated in
# pandas >= 2.2, while 'MS' is accepted by every pandas version. The same
# frequency is reused for the future frame so history and forecast line up.
monthly_freq = 'MS'
months = pd.date_range(start='2023-01-01', periods=12, freq=monthly_freq)
trend = np.linspace(2000, 4500, 12)  # Linear growth trend
seasonality = 300 * np.sin(2 * np.pi * (months.month - 1) / 12)  # Yearly seasonality
noise = np.random.normal(0, 200, 12)  # Random noise

# Combine components to create 'y' (sales)
sales = trend + seasonality + noise

# Create the DataFrame ('ds' / 'y' are the column names Prophet is fitted on)
data = {
    'ds': months,
    'y': sales
}
df = pd.DataFrame(data)

print("\n--- Dataset ---")
print(df)

# Initialize Prophet model
# NOTE(review): only 12 monthly observations (a single yearly cycle) are
# available, so the fitted yearly component should be treated as illustrative.
model = Prophet(yearly_seasonality=True, weekly_seasonality=False)
model.fit(df)

# Make future predictions (history + 6 additional monthly periods)
future = model.make_future_dataframe(periods=6, freq=monthly_freq)
forecast = model.predict(future)

# Add forecasted period annotation: rows strictly after the last observed date
forecast_period = future[future['ds'] > df['ds'].max()]
print("\n--- Forecast Period ---")
print(forecast_period)

print("\n--- Forecast Summary ---")
print(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail(6))

# Plot the forecast with better annotations
fig1 = model.plot(forecast)
plt.title("Sales Forecast with Future Predictions")
plt.xlabel("Date")
plt.ylabel("Sales")
plt.grid(True)

# Highlight forecasted periods
plt.axvspan(df['ds'].max(), future['ds'].max(), color='orange', alpha=0.2, label="Forecasted Period")
plt.legend()
plt.show()

# Plot forecast components
fig2 = model.plot_components(forecast)
fig2.suptitle("Forecast Components", y=1.02)
plt.show()

# Extended visualizations
print("\n--- Extended Visualizations ---")

# Visualize actual vs predicted sales
plt.figure(figsize=(10, 6))
plt.plot(df['ds'], df['y'], label='Actual Sales', marker='o')
plt.plot(forecast['ds'], forecast['yhat'], label='Predicted Sales', linestyle='--', color='orange')
plt.fill_between(forecast['ds'], forecast['yhat_lower'], forecast['yhat_upper'], color='orange', alpha=0.2,
                 label='Uncertainty Interval')
plt.axvline(df['ds'].max(), color='red', linestyle='--', label='Forecast Start')
plt.title("Actual vs Predicted Sales")
plt.xlabel("Date")
plt.ylabel("Sales")
plt.legend()
plt.grid(True)
plt.show()

# Highlight seasonality trends (use 'yearly' if it exists in the forecast)
if 'yearly' in forecast.columns:
    print("\n--- Highlighting Yearly Seasonality Trend ---")
    plt.figure(figsize=(10, 6))
    plt.plot(forecast['ds'], forecast['yearly'], label='Yearly Seasonality', color='green')
    plt.title("Yearly Seasonality Trend")
    plt.xlabel("Date")
    plt.ylabel("Yearly Seasonality Impact")
    plt.legend()
    plt.grid(True)
    plt.show()
else:
    print("\nYearly seasonality is not available in the forecast.")


# Real-Time Expense Monitoring

In [None]:
# Expense monitoring demo: classify free-text expense descriptions into
# categories with a bag-of-words + random forest model, then categorize
# previously unseen expenses.

# Example dataset
data = {
    'Description': ['Coffee', 'Office Supplies', 'Taxi', 'Lunch', 'Notebook', 'Dinner', 'Train Ticket', 'Pens', 'Uber',
                    'Sandwich'],
    'Amount': [4.5, 25.0, 30.0, 12.0, 15.0, 20.0, 50.0, 5.0, 35.0, 8.0],
    'Category': ['Food', 'Office', 'Transport', 'Food', 'Office', 'Food', 'Transport', 'Office', 'Transport', 'Food']
}
df = pd.DataFrame(data)

# Print the dataset
print("Dataset:")
print(df)

# Visualize the dataset
# `hue` mirrors the categorical axis because passing `palette` without `hue`
# is deprecated in seaborn 0.13; the redundant legend is removed if drawn.
plt.figure(figsize=(10, 6))
ax = sns.countplot(data=df, x='Category', hue='Category', palette='viridis', dodge=False)
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title('Category Distribution')
plt.xlabel('Category')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Visualize amount distribution per category
plt.figure(figsize=(10, 6))
ax = sns.boxplot(data=df, x='Category', y='Amount', hue='Category', palette='coolwarm', dodge=False)
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title('Amount Distribution by Category')
plt.xlabel('Category')
plt.ylabel('Amount')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Transform data for the classifier
# NOTE: the vocabulary is learned from every description, including the rows
# that are held out for testing below.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['Description'])
y = df['Category']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the classifier
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Evaluate the classifier
# zero_division=0 reports 0 (instead of warning) for classes that are never
# predicted, which is expected with a test set this small.
y_pred = clf.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

# Confusion Matrix
# `labels=clf.classes_` forces one row/column per trained class, so the matrix
# always matches the tick labels even if a class is absent from the test split.
conf_matrix = confusion_matrix(y_test, y_pred, labels=clf.classes_)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, cmap='Blues', fmt='d',
            xticklabels=clf.classes_, yticklabels=clf.classes_)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.tight_layout()
plt.show()

# Print feature importance
feature_importances = clf.feature_importances_
feature_names = vectorizer.get_feature_names_out()
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
print("\nTop Features by Importance:")
print(importance_df.head(10))

# Plot feature importance
plt.figure(figsize=(10, 6))
ax = sns.barplot(data=importance_df.head(10), x='Importance', y='Feature', hue='Feature', palette='magma',
                 dodge=False)
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title('Top 10 Features by Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

# Predict new expenses
# Words the vectorizer has never seen are ignored by `transform`, so entries
# made only of unseen words are classified from an all-zero feature vector.
new_data = ['Uber', 'Printer Ink', 'Pizza', 'Train', 'Laptop']
X_new = vectorizer.transform(new_data)
predictions = clf.predict(X_new)
# Highest class probability per row = the model's confidence in its prediction.
confidences = clf.predict_proba(X_new).max(axis=1)

# Print predictions
prediction_df = pd.DataFrame({
    'Expense': new_data,
    'Predicted Category': predictions,
    'Confidence': confidences
})
print("\nPredictions:")
print(prediction_df)

# Visualize predictions
# Bar height is the prediction confidence. Using the row index as height (as
# before) carried no meaning and left the first expense with an invisible,
# zero-height bar.
plt.figure(figsize=(10, 6))
sns.barplot(data=prediction_df, x='Expense', y='Confidence', hue='Predicted Category', dodge=False,
            palette='Set2')
plt.title('Predicted Categories for New Expenses')
plt.xlabel('Expense')
plt.ylabel('Prediction Confidence')
plt.legend(title='Category')
plt.tight_layout()
plt.show()


# Employee Attrition Prediction

In [None]:
# Employee attrition demo: train a random forest on four HR features to
# predict whether an employee left, then inspect feature importance and a
# single prediction.

# Sample dataset
data = {
    'JobSatisfaction': [3, 2, 4, 1, 3, 2, 5, 4, 3, 1],
    'YearsAtCompany': [5, 1, 7, 2, 3, 1, 8, 6, 4, 2],
    'PerformanceRating': [4, 3, 5, 2, 4, 3, 5, 4, 4, 2],
    'MonthlyIncome': [5000, 2000, 8000, 2500, 4500, 2200, 8500, 7000, 4000, 2100],
    'Left': [0, 1, 0, 1, 0, 1, 0, 0, 0, 1]  # 1 = Left the company, 0 = Stayed
}
df = pd.DataFrame(data)

# Print dataset info
print("Dataset Summary:")
print(df.describe())
print("\nDataset Head:")
print(df.head())

# Visualize data distribution
plt.figure(figsize=(10, 6))
sns.histplot(df['MonthlyIncome'], kde=True, color='blue', bins=10)
plt.title('Monthly Income Distribution')
plt.xlabel('Monthly Income')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

# `hue` mirrors the x variable because passing `palette` without `hue` is
# deprecated in seaborn 0.13; the redundant legend is removed if drawn.
plt.figure(figsize=(10, 6))
ax = sns.boxplot(data=df, x='Left', y='MonthlyIncome', hue='Left', palette='Set2', dodge=False)
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title('Monthly Income by Attrition Status')
plt.xlabel('Attrition Status (Left: 1, Stayed: 0)')
plt.ylabel('Monthly Income')
plt.tight_layout()
plt.show()

# Features and target
X = df[['JobSatisfaction', 'YearsAtCompany', 'PerformanceRating', 'MonthlyIncome']]
y = df['Left']

# Train/test split (20% of 10 rows -> 2 held-out samples)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions
predictions = model.predict(X_test)

# Print model evaluation metrics
# (classification_report / accuracy_score are imported in the notebook's import cell)
print("\nClassification Report:")
print(classification_report(y_test, predictions))
print(f"Accuracy: {accuracy_score(y_test, predictions):.2f}")

# Feature importance using permutation importance
# Importance = mean drop in score when a feature's values are shuffled,
# measured on the held-out rows over 10 repeats.
perm_importance = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42)
perm_importance_df = pd.DataFrame({
    'Feature': X_test.columns,
    'Importance': perm_importance.importances_mean
}).sort_values(by='Importance', ascending=False)

print("\nPermutation Feature Importance:")
print(perm_importance_df)

# Bar plot for feature importance
plt.figure(figsize=(10, 6))
ax = sns.barplot(data=perm_importance_df, x='Importance', y='Feature', hue='Feature', palette='viridis',
                 dodge=False)
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title('Permutation Feature Importance')
plt.xlabel('Mean Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

# Inspect a single held-out sample: its class probabilities and feature values
sample_idx = 0
sample_features = X_test.iloc[sample_idx]
# Pass a one-row DataFrame (double brackets) so the column names the model was
# fitted with are preserved; a list wrapping a Series drops them and triggers
# scikit-learn's "X does not have valid feature names" warning.
sample_prediction = model.predict_proba(X_test.iloc[[sample_idx]])[0]

print(f"\nPrediction probabilities for sample {sample_idx}:")
# Label each probability with the model's own class label rather than its
# position, so the printout stays correct whatever the label values are.
for class_label, prob in zip(model.classes_, sample_prediction):
    print(f"Class {class_label}: {prob:.2f}")

# Raw feature values of the inspected sample (original units, so
# 'MonthlyIncome' dominates the y-axis scale)
plt.figure(figsize=(10, 6))
plt.bar(X_test.columns, sample_features, color='teal')
plt.title(f'Feature Values for Sample {sample_idx}')
plt.xlabel('Feature')
plt.ylabel('Value')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
