In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Load CNC Dataset
import os

# Location of the CNC workbook. The original author's local path is kept as the
# default so existing runs behave the same; set the CNC_DATA_PATH environment
# variable to point at the file on any other machine.
CNC_DATA_PATH = os.environ.get(
    "CNC_DATA_PATH",
    "C:\\Users\\Bunny\\Downloads\\CNC_Machine_Data_CNC003.xlsx",
)
cnc_data = pd.read_excel(CNC_DATA_PATH)
cnc_data.head()

: 

In [None]:
# Keep only complete rows: any row with at least one missing value is removed.
# The raw frame `cnc_data` is left untouched.
dropped_cnc = cnc_data.dropna(axis=0, how="any")
dropped_cnc.head(5)

In [None]:
# Data Preprocessing
# Drop the columns the analysis treats as irrelevant.
cnc_data_cleaned = dropped_cnc.drop(['Timestamp', 'Machine ID', 'Production Line', 'Error Code'], axis=1)

# Encode categorical (object-dtype) columns as integer codes.
# Every column gets its own fitted LabelEncoder, stored in `label_encoders`,
# so the integer codes can be mapped back to the original labels later via
# `label_encoders[col].inverse_transform(...)`. Reusing a single encoder would
# discard every mapping except the one for the last column.
categorical_columns = cnc_data_cleaned.select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_columns:
    label_encoder = LabelEncoder()
    # Values are cast to str first so mixed-type object columns encode cleanly.
    cnc_data_cleaned[col] = label_encoder.fit_transform(cnc_data_cleaned[col].astype(str))
    label_encoders[col] = label_encoder

In [None]:
# `info()` prints its summary directly; `head()` only renders when it is the
# last expression of the cell, so it has to come last to be shown at all.
cnc_data_cleaned.info()
cnc_data_cleaned.head()


In [None]:
# Separate the target column from the feature columns
y = cnc_data_cleaned['Maintenance Required']
X = cnc_data_cleaned.drop(columns=['Maintenance Required'])

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize features: statistics are learned from the training rows only and
# then applied to both partitions. From here on X_train / X_test are NumPy arrays.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Models to Evaluate
# Maps a display name to an unfitted estimator. The tree-based models are
# randomized, so they get a fixed random_state to make the reported metrics
# reproducible from run to run; the remaining estimators use library defaults.
models = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(),
    'Gaussian Naive Bayes': GaussianNB(),
    'KNN': KNeighborsClassifier(),
    'LDA': LDA(),
    'QDA': QDA()
}

In [None]:
# Fit every candidate on the training split and record its test-set metrics.
# `results` maps model name -> classification report as a nested dict.
results = {}
for name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)
    results[name] = classification_report(y_test, y_pred, output_dict=True)

In [None]:
# Print a human-readable report per model. The stored `results` hold dicts, so
# the text version is regenerated from the already-fitted estimators.
for name in results:
    fitted_model = models[name]
    print(f"\n{name} Classification Report:\n")
    print(classification_report(y_test, fitted_model.predict(X_test)))

In [None]:
# Import the layers from the same Keras distribution that provides `Sequential`
# (tensorflow.keras, see the notebook's first cell); mixing standalone `keras`
# layers with a `tensorflow.keras` model can fail when the two packages differ.
from tensorflow.keras.layers import Dense, Input

# Building a Sequential Model with an Input layer:
# two hidden ReLU layers followed by a single sigmoid output unit.
sequential_model = Sequential([
    Input(shape=(X_train.shape[1],)),  # one input per feature column
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

# NOTE(review): sigmoid + binary_crossentropy assumes 'Maintenance Required'
# is a binary 0/1 target — confirm against the encoded data.
sequential_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Fit the network on the training split
sequential_model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    verbose=1,
)

# Turn the sigmoid outputs into hard 0/1 labels using a 0.5 cut-off
predicted_probabilities = sequential_model.predict(X_test)
sequential_predictions = (predicted_probabilities > 0.5).astype(int)

print("\nSequential Model Classification Report:\n")
print(classification_report(y_test, sequential_predictions))


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Base estimator for the search; the seed is fixed so repeated fits agree
dt_model = DecisionTreeClassifier(random_state=42)

# Hyper-parameter values explored exhaustively by the grid search
param_grid = dict(
    max_depth=[5, 10, 15, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [None]:
# Rebuild the feature matrix and target from the cleaned frame.
# This repeats the earlier split with the same test_size and random_state, so it
# yields the same partitions; it makes the tuning section runnable on its own.
target_name = 'Maintenance Required'
X = cnc_data_cleaned.drop(columns=[target_name])
y = cnc_data_cleaned[target_name]

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Standardize using training-set statistics only
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Exhaustive 5-fold search over `param_grid`, scored by accuracy and run on all
# available cores.
grid_search = GridSearchCV(
    dt_model,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Winning configuration and the estimator refitted with it
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

In [None]:
# Score the tuned tree on the held-out test split
y_pred = best_model.predict(X_test)
optimized_report = classification_report(y_test, y_pred)

print("Best Parameters:", best_params)
# Heading and report are emitted by one call; sep="\n" reproduces the line
# break that a separate print() would have produced between them.
print("\nClassification Report for Optimized Decision Tree:\n", optimized_report, sep="\n")


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
import numpy as np

# Base estimator for the randomized search (fixed seed)
dt_model = DecisionTreeClassifier(random_state=42)

# Candidate values the randomized search samples from
param_distributions = dict(
    max_depth=[5, 10, 15, None],
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 2, 4, 8],
    criterion=['gini', 'entropy'],
)


In [None]:
# Randomized search: 20 sampled parameter combinations, each evaluated with
# 5-fold cross-validation on accuracy. The sampling itself is seeded.
random_search = RandomizedSearchCV(
    dt_model,
    param_distributions,
    n_iter=20,
    scoring='accuracy',
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
random_search.fit(X_train, y_train)

# Winning configuration and the estimator refitted with it
best_model = random_search.best_estimator_
best_params = random_search.best_params_

In [None]:
# Predict on the held-out split with the estimator chosen by the search
y_pred = best_model.predict(X_test)

# Text summary of precision / recall / F1 per class
optimized_report = classification_report(y_test, y_pred)

# Show the chosen configuration followed by the report
print("Best Parameters:", best_params)
print("\nClassification Report for Optimized Decision Tree:\n", optimized_report, sep="\n")

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import numpy as np

# Running record of the best configuration found by the manual search:
# no winner yet, and any positive accuracy beats the starting score of 0.
best_score = 0
best_params = None

# Hyper-parameter values the manual search iterates over
param_grid = dict(
    max_depth=[5, 10, 15, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [None]:
# Manual grid search with stratified 5-fold cross-validation.
#
# For every combination in `param_grid`, a decision tree is trained and scored
# on each fold; the combination with the highest mean validation accuracy is
# kept in `best_params` / `best_score` (both initialized in the previous cell).
# The whole search lives in ONE cell: fitting, scoring and the best-so-far
# update must run inside the loops, otherwise only the last fold of the last
# combination would ever be evaluated.

# Convert y_train to a numpy array so it can be indexed by fold positions
y_train_array = y_train.values if hasattr(y_train, "values") else y_train

# Stratified folds keep the class proportions of y_train in every split
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for max_depth in param_grid['max_depth']:
    for min_samples_split in param_grid['min_samples_split']:
        for min_samples_leaf in param_grid['min_samples_leaf']:
            fold_scores = []

            for train_idx, val_idx in kf.split(X_train, y_train_array):
                # Split the data into train and validation sets
                X_fold_train, X_fold_val = X_train[train_idx], X_train[val_idx]
                y_fold_train, y_fold_val = y_train_array[train_idx], y_train_array[val_idx]

                # Initialize the model with the current set of parameters
                model = DecisionTreeClassifier(
                    max_depth=max_depth,
                    min_samples_split=min_samples_split,
                    min_samples_leaf=min_samples_leaf,
                    random_state=42
                )

                # Train on this fold's training part, score on its validation part
                model.fit(X_fold_train, y_fold_train)
                y_pred = model.predict(X_fold_val)
                fold_scores.append(accuracy_score(y_fold_val, y_pred))

            # Mean validation accuracy of this parameter combination
            avg_score = np.mean(fold_scores)

            # Record the combination only when it strictly improves on the best
            # so far (ties keep the earlier, i.e. simpler-first, combination)
            if avg_score > best_score:
                best_score = avg_score
                best_params = {
                    'max_depth': max_depth,
                    'min_samples_split': min_samples_split,
                    'min_samples_leaf': min_samples_leaf
                }

In [None]:
# Refit a tree with the selected hyper-parameters on the complete training split
final_model = DecisionTreeClassifier(random_state=42, **best_params)
final_model.fit(X_train, y_train)

# Held-out evaluation
y_test_pred = final_model.predict(X_test)
final_report = classification_report(y_test, y_test_pred)

# Report the chosen configuration, its cross-validated score and test metrics
print("Best Parameters:", best_params)
print("Best Cross-Validated Accuracy:", best_score)
print("\nClassification Report for Final Model:\n", final_report, sep="\n")

In [None]:
# Convert X_train and X_test (NumPy arrays after scaling) back to DataFrames so
# features can be addressed by name
feature_names = cnc_data_cleaned.drop("Maintenance Required", axis=1).columns
X_train_df = pd.DataFrame(X_train, columns=feature_names)
X_test_df = pd.DataFrame(X_test, columns=feature_names)

# Pairwise correlation between features, computed on the training split only
correlation_matrix = X_train_df.corr()

# Plot the heatmap with matplotlib alone: the notebook never imports seaborn, so
# a call through `sns` would raise NameError. The colour scale is pinned to
# [-1, 1], the full range of a correlation coefficient.
n_features = len(feature_names)
fig, ax = plt.subplots(figsize=(10, 8))
heatmap = ax.imshow(correlation_matrix.to_numpy(), cmap='coolwarm', vmin=-1, vmax=1)
fig.colorbar(heatmap, ax=ax)
ax.set_xticks(range(n_features))
ax.set_xticklabels(feature_names, rotation=90)
ax.set_yticks(range(n_features))
ax.set_yticklabels(feature_names)
# Annotate every cell with its coefficient, rounded to two decimals
for row_idx in range(n_features):
    for col_idx in range(n_features):
        ax.text(col_idx, row_idx, f"{correlation_matrix.iloc[row_idx, col_idx]:.2f}",
                ha='center', va='center', fontsize=8)
ax.set_title("Correlation Matrix")
plt.show()

# Remove highly correlated features: for each pair whose absolute correlation
# exceeds the threshold, the later column of the pair is marked for removal
threshold = 0.9  # Correlation threshold
correlated_features = set()
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > threshold:
            correlated_features.add(correlation_matrix.columns[i])

# Drop correlated features (same columns from both splits)
X_train_uncorrelated = X_train_df.drop(columns=list(correlated_features))
X_test_uncorrelated = X_test_df.drop(columns=list(correlated_features))

# Print the features retained after removing highly correlated features
print("Features retained after correlation analysis:", X_train_uncorrelated.columns.tolist())


In [None]:
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Fit a seeded random forest purely to rank the features
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Importance per feature, labelled with the original column names
feature_importances = pd.Series(rf_model.feature_importances_, index=X_train_df.columns)

# Bar chart of importances, most important first
fig, ax = plt.subplots(figsize=(12, 6))
ranked_importances = feature_importances.sort_values(ascending=False)
ranked_importances.plot(kind='bar', ax=ax)
ax.set_title("Feature Importance")
ax.set_ylabel("Importance")
plt.show()

# Keep the ten most important features in both splits
top_features = feature_importances.nlargest(10).index
X_train_important = X_train_df.loc[:, top_features]
X_test_important = X_test_df.loc[:, top_features]


In [None]:
from sklearn.feature_selection import mutual_info_classif

# Calculate mutual information scores between each feature and the target.
# The estimator is randomized, so the seed is fixed: without it the scores, and
# therefore the set of features passing the threshold below, change between runs.
mi_scores = mutual_info_classif(X_train, y_train, random_state=42)

# Convert scores to a Series with column names
mi_scores_series = pd.Series(mi_scores, index=X_train_df.columns)

# Plot mutual information scores, highest first
plt.figure(figsize=(12, 6))
mi_scores_series.sort_values(ascending=False).plot(kind='bar')
plt.title("Mutual Information Scores")
plt.ylabel("Mutual Information")
plt.show()

# Select features with high mutual information
threshold = 0.01  # Minimum score to keep a feature
selected_features = mi_scores_series[mi_scores_series > threshold].index
X_train_selected = X_train_df[selected_features]
X_test_selected = X_test_df[selected_features]

In [None]:
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier

# Seeded decision tree used as the ranking estimator inside RFE
dt_model = DecisionTreeClassifier(random_state=42)

# Recursive feature elimination down to ten features
rfe = RFE(dt_model, n_features_to_select=10)
rfe.fit(X_train_df, y_train)

# Names of the columns RFE decided to keep
selected_features = X_train_df.columns[rfe.get_support()]

# Restrict both splits to the surviving columns
X_train_rfe = X_train_df.loc[:, selected_features]
X_test_rfe = X_test_df.loc[:, selected_features]

print("Selected Features:", selected_features.tolist())


In [None]:
# Class balance of the target: absolute counts first, then shares in percent
target_column = cnc_data_cleaned['Maintenance Required']

target_distribution = target_column.value_counts()
print("Target Variable Distribution:\n", target_distribution)

target_percentage = target_column.value_counts(normalize=True) * 100
print("\nTarget Variable Percentage Distribution:\n", target_percentage)


In [None]:
# Average value of every feature within each target class
rows_by_class = cnc_data_cleaned.groupby('Maintenance Required')
summary = rows_by_class.mean()
print("\nMean Feature Values by Target Class:\n", summary)
