In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Load, clean and feature-engineer the raw issue export, then write it to
# 'cleaned_engineered_dataset1.csv' for the modelling cell.

# Load dataset
dataset_path = '/content/_ambari.csv'
df = pd.read_csv(dataset_path, sep=',', encoding='ISO-8859-1', header=0)

# The file starts with a UTF-8 byte-order mark. Decoded as ISO-8859-1 it becomes the
# three characters U+00EF U+00BB U+00BF glued onto the first header, so strip them.
df.columns = [str(col).replace('\u00ef\u00bb\u00bf', '') for col in df.columns]

# Show dataset info and preview
print('Dataset Info:')
df.info()  # info() prints on its own and returns None; print(df.info()) adds a stray "None"
print('\nFirst 5 rows:')
print(df.head())

# Handle missing values: normalise the textual placeholders to real NaN
df = df.replace(['null', 'NULL', 'NaN', '', ' '], np.nan)

# Rows with no data at all would otherwise be imputed into a synthetic issue
df = df.dropna(how='all').reset_index(drop=True)

datetime_cols = ['created', 'resolved', 'assigned']
critical_labels = ['Surprising', 'Dormant', 'Blocker', 'Security', 'Performance', 'Breakage']

# Remember which label cells were actually populated BEFORE imputation; once the
# columns are filled below, notnull() is True everywhere and the flag is constant.
# NOTE(review): if these columns hold 0/1 indicators rather than "present/absent",
# the flags should be built from the values instead - confirm against the data.
label_present = {label: df[label].notna() for label in critical_labels if label in df.columns}

# Fill missing numeric columns with median
numeric_cols = df.select_dtypes(include=[np.number]).columns
df[numeric_cols] = df[numeric_cols].apply(lambda col: col.fillna(col.median()))

# Fill missing categorical columns with mode.
# Date columns are skipped: giving a missing 'assigned' the most frequent timestamp
# fabricates an assignment time and yields meaningless (even negative) durations.
categorical_cols = [col for col in df.select_dtypes(include=[object]).columns
                    if col not in datetime_cols]
for col in categorical_cols:
    modes = df[col].mode()
    if not modes.empty:  # an all-NaN column has no mode; leave it untouched
        df[col] = df[col].fillna(modes.iloc[0])

# Convert datetime columns (unparseable values become NaT)
for col in datetime_cols:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')

# Feature engineering

# 1. Time to resolve (hours)
df['time_to_resolve'] = (df['resolved'] - df['created']).dt.total_seconds() / 3600

# 2. Time to assign (hours); issues that were never assigned get 0
if 'assigned' in df.columns:
    df['time_to_assign'] = (df['assigned'] - df['created']).dt.total_seconds() / 3600
    df['time_to_assign'] = df['time_to_assign'].fillna(0)
else:
    # The summary print at the end selects this column, so it must always exist
    df['time_to_assign'] = 0

# 3. Time to fix (80% of time to resolve)
df['time_to_fix'] = df['time_to_resolve'] * 0.8

# 4. Description word count
if 'description' in df.columns:
    df['description_word_count'] = df['description'].astype(str).str.split().str.len().fillna(0)
else:
    df['description_word_count'] = 0

# 5. Number of commenters
if 'commenter' in df.columns:
    df['commenter_count'] = df['commenter'].astype(str).str.split(',').str.len().fillna(0)
else:
    df['commenter_count'] = 0

# 6. Sum of votes and watches (only over the columns that exist)
vote_cols = [col for col in ['votes', 'watches'] if col in df.columns]
for col in vote_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
df['votes_watches_sum'] = df[vote_cols].sum(axis=1) if vote_cols else 0

# 7. Binary flags for critical labels, taken from the pre-imputation mask
for label in critical_labels:
    if label in label_present:
        df[f'is_{label.lower()}'] = label_present[label].astype(int)
    else:
        df[f'is_{label.lower()}'] = 0

# 8. Encode categorical variables; one encoder per column so every mapping
#    stays available through label_encoders[col].classes_
label_encoders = {}
categorical_cols = ['type', 'status', 'resolution', 'component', 'priority', 'reporter', 'assignee']
for col in categorical_cols:
    if col in df.columns:
        label_encoder = LabelEncoder()
        df[col] = label_encoder.fit_transform(df[col].astype(str))
        label_encoders[col] = label_encoder

# 9. Interaction term: Priority * Time to resolve
# NOTE(review): 'priority' is now an alphabetical label code, not a severity scale,
# and this feature is derived from the column the modelling cell uses as its target.
if 'priority' in df.columns:
    df['priority_time_interaction'] = df['priority'] * df['time_to_resolve']
else:
    df['priority_time_interaction'] = 0

# Save cleaned and engineered dataset
cleaned_engineered_data_path = 'cleaned_engineered_dataset1.csv'
df.to_csv(cleaned_engineered_data_path, index=False)
print(f'Cleaned and engineered dataset saved at: {cleaned_engineered_data_path}')

# Show sample of engineered features
print('\nSample of engineered features:')
print(df[['time_to_resolve', 'time_to_assign', 'time_to_fix', 'description_word_count',
          'commenter_count', 'votes_watches_sum', 'priority_time_interaction']].head())


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 32 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ï»¿issue_id        1000 non-null   float64
 1   type               1000 non-null   object 
 2   status             1000 non-null   object 
 3   resolution         1000 non-null   object 
 4   component          1000 non-null   object 
 5   priority           1000 non-null   object 
 6   reporter           1000 non-null   object 
 7   created            1000 non-null   object 
 8   assigned           997 non-null    object 
 9   assignee           1000 non-null   object 
 10  resolved           1000 non-null   object 
 11  created.1          1000 non-null   float64
 12  assigned.1         997 non-null    float64
 13  summary            1000 non-null   object 
 14  description        1000 non-null   object 
 15  affected_version   1000 non-null   object 
 16  fixed_vers

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, BatchNormalization, Flatten, Concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score

# Train and evaluate a two-branch (LSTM on text + dense on tabular) binary classifier
# on the engineered dataset, then save the model and print test metrics.

# Load dataset
dataset_path = 'cleaned_engineered_dataset1.csv'
df = pd.read_csv(dataset_path)

# Seed Python, NumPy and TensorFlow so initialisation and dropout repeat across runs
tf.keras.utils.set_random_seed(42)

# Keep the raw description text aside BEFORE the label-encoding loop below.
# Encoding it first would replace each description with an integer id, and the
# tokenizer would then see strings of digits instead of words.
has_description = 'description' in df.columns
if has_description:
    description_text = df['description'].fillna('').astype(str)

# Handling missing values (ensure no NaNs)
df = df.fillna(0)

# Convert categorical features to numerical using LabelEncoder
for col in df.select_dtypes(include=['object']).columns:
    if col == 'description':
        continue  # free text goes through the tokenizer, not the label encoder
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

# Define target variable
target = 'priority'
y = df[target].values

# Convert target to binary classification (e.g., median split)
# NOTE(review): 'priority' holds label codes, so this threshold splits on encoding
# order and can give a very lopsided class balance - check the share printed below.
median_time = np.median(y)
y_binary = (y >= median_time).astype(int)
print(f'Positive class share: {y_binary.mean():.3f}')

# Extract numerical and categorical features.
# 'priority_time_interaction' is priority * time_to_resolve, i.e. computed from the
# target, so it must not be offered to the model as an input.
excluded_cols = ['description', target, 'priority_time_interaction']
feature_cols = [col for col in df.columns if col not in excluded_cols]

# Split dataset by row index first, so the tokenizer and scaler below are fitted on
# training rows only. Same test_size/random_state as before.
row_idx = np.arange(len(df))
train_idx, test_idx, y_train, y_test = train_test_split(
    row_idx, y_binary, test_size=0.2, random_state=42
)

# Text processing for 'description' column (if available)
if has_description:
    tokenizer = Tokenizer(num_words=5000)
    tokenizer.fit_on_texts(description_text.iloc[train_idx])  # vocabulary from train only
    X_text = tokenizer.texts_to_sequences(description_text)
    X_text = pad_sequences(X_text, maxlen=100)
else:
    X_text = np.zeros((len(df), 100))
X_text_train, X_text_test = X_text[train_idx], X_text[test_idx]

# Normalize numerical features with statistics from the training rows only
X_tabular_raw = df[feature_cols].values
scaler = StandardScaler()
scaler.fit(X_tabular_raw[train_idx])
X_tabular = scaler.transform(X_tabular_raw)
X_tab_train, X_tab_test = X_tabular[train_idx], X_tabular[test_idx]

# Model architecture
text_input = Input(shape=(100,), name='text_input')
# The sequence length comes from the Input shape; Embedding's input_length argument is deprecated
text_embedding = Embedding(input_dim=5000, output_dim=128)(text_input)
text_lstm = LSTM(64, return_sequences=False)(text_embedding)
text_dense = Dense(32, activation='relu')(text_lstm)

# Tabular input
tabular_input = Input(shape=(X_tab_train.shape[1],), name='tabular_input')
tabular_dense = Dense(64, activation='relu')(tabular_input)
tabular_dense = BatchNormalization()(tabular_dense)
tabular_dense = Dropout(0.3)(tabular_dense)

# Combine both inputs
merged = Concatenate()([text_dense, tabular_dense])
final_dense = Dense(64, activation='relu')(merged)
final_dense = Dropout(0.3)(final_dense)
output = Dense(1, activation='sigmoid', name='output')(final_dense)

# Compile model
model = Model(inputs=[text_input, tabular_input], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train model. Validation uses a slice of the training data so the test set stays
# unseen until the final evaluation.
model.fit(
    [X_text_train, X_tab_train], y_train,
    validation_split=0.2,
    epochs=20, batch_size=32
)

# Save model
model.save('/content/rnn_mlp_model1.h5')
print('Model saved!')

# Evaluate model
y_pred_probs = model.predict([X_text_test, X_tab_test]).ravel()  # (n, 1) -> (n,)
y_pred = (y_pred_probs >= 0.5).astype(int)

# MSE/MAE/R^2 compare predicted probabilities with the 0/1 labels; they are kept so
# the printed report has the same lines as before.
mse = mean_squared_error(y_test, y_pred_probs)
mae = mean_absolute_error(y_test, y_pred_probs)
r2 = r2_score(y_test, y_pred_probs)
accuracy = accuracy_score(y_test, y_pred)
# zero_division=0 keeps these defined when the model predicts a single class
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R^2 Score: {r2}')
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')




Epoch 1/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 35ms/step - accuracy: 0.4799 - loss: 0.9752 - val_accuracy: 0.7910 - val_loss: 0.6436
Epoch 2/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.8811 - loss: 0.3598 - val_accuracy: 0.7910 - val_loss: 0.5031
Epoch 3/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.8635 - loss: 0.3761 - val_accuracy: 0.7910 - val_loss: 0.5323
Epoch 4/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.8815 - loss: 0.3528 - val_accuracy: 0.7910 - val_loss: 0.5203
Epoch 5/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.9017 - loss: 0.3331 - val_accuracy: 0.7910 - val_loss: 0.4942
Epoch 6/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.8903 - loss: 0.3088 - val_accuracy: 0.7910 - val_loss: 0.4895
Epoch 7/20
[1m25/25[0m [32m━━━━



Model saved!
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
Mean Squared Error: 0.1532512903213501
Mean Absolute Error: 0.23561657965183258
R^2 Score: 0.07285046577453613
Accuracy: 0.7860696517412935
Precision: 0.79
Recall: 0.9937106918238994
F1 Score: 0.8802228412256268


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Load, clean and feature-engineer the raw issue export, then write it to
# 'cleaned_engineered_dataset2.csv' for the modelling cell.

# Load dataset
dataset_path = '/content/_camel.csv'
df = pd.read_csv(dataset_path, sep=',', encoding='ISO-8859-1', header=0)

# The file starts with a UTF-8 byte-order mark. Decoded as ISO-8859-1 it becomes the
# three characters U+00EF U+00BB U+00BF glued onto the first header, so strip them.
df.columns = [str(col).replace('\u00ef\u00bb\u00bf', '') for col in df.columns]

# Show dataset info and preview
print('Dataset Info:')
df.info()  # info() prints on its own and returns None; print(df.info()) adds a stray "None"
print('\nFirst 5 rows:')
print(df.head())

# Handle missing values: normalise the textual placeholders to real NaN
df = df.replace(['null', 'NULL', 'NaN', '', ' '], np.nan)

# Rows with no data at all would otherwise be imputed into a synthetic issue
df = df.dropna(how='all').reset_index(drop=True)

datetime_cols = ['created', 'resolved', 'assigned']
critical_labels = ['Surprising', 'Dormant', 'Blocker', 'Security', 'Performance', 'Breakage']

# Remember which label cells were actually populated BEFORE imputation; once the
# columns are filled below, notnull() is True everywhere and the flag is constant.
# NOTE(review): if these columns hold 0/1 indicators rather than "present/absent",
# the flags should be built from the values instead - confirm against the data.
label_present = {label: df[label].notna() for label in critical_labels if label in df.columns}

# Fill missing numeric columns with median
numeric_cols = df.select_dtypes(include=[np.number]).columns
df[numeric_cols] = df[numeric_cols].apply(lambda col: col.fillna(col.median()))

# Fill missing categorical columns with mode.
# Date columns are skipped: giving a missing 'assigned' the most frequent timestamp
# fabricates an assignment time and yields meaningless (even negative) durations.
categorical_cols = [col for col in df.select_dtypes(include=[object]).columns
                    if col not in datetime_cols]
for col in categorical_cols:
    modes = df[col].mode()
    if not modes.empty:  # an all-NaN column has no mode; leave it untouched
        df[col] = df[col].fillna(modes.iloc[0])

# Convert datetime columns (unparseable values become NaT)
for col in datetime_cols:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')

# Feature engineering

# 1. Time to resolve (hours)
df['time_to_resolve'] = (df['resolved'] - df['created']).dt.total_seconds() / 3600

# 2. Time to assign (hours); issues that were never assigned get 0
if 'assigned' in df.columns:
    df['time_to_assign'] = (df['assigned'] - df['created']).dt.total_seconds() / 3600
    df['time_to_assign'] = df['time_to_assign'].fillna(0)
else:
    # The summary print at the end selects this column, so it must always exist
    df['time_to_assign'] = 0

# 3. Time to fix (80% of time to resolve)
df['time_to_fix'] = df['time_to_resolve'] * 0.8

# 4. Description word count
if 'description' in df.columns:
    df['description_word_count'] = df['description'].astype(str).str.split().str.len().fillna(0)
else:
    df['description_word_count'] = 0

# 5. Number of commenters
if 'commenter' in df.columns:
    df['commenter_count'] = df['commenter'].astype(str).str.split(',').str.len().fillna(0)
else:
    df['commenter_count'] = 0

# 6. Sum of votes and watches (only over the columns that exist)
vote_cols = [col for col in ['votes', 'watches'] if col in df.columns]
for col in vote_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
df['votes_watches_sum'] = df[vote_cols].sum(axis=1) if vote_cols else 0

# 7. Binary flags for critical labels, taken from the pre-imputation mask
for label in critical_labels:
    if label in label_present:
        df[f'is_{label.lower()}'] = label_present[label].astype(int)
    else:
        df[f'is_{label.lower()}'] = 0

# 8. Encode categorical variables; one encoder per column so every mapping
#    stays available through label_encoders[col].classes_
label_encoders = {}
categorical_cols = ['type', 'status', 'resolution', 'component', 'priority', 'reporter', 'assignee']
for col in categorical_cols:
    if col in df.columns:
        label_encoder = LabelEncoder()
        df[col] = label_encoder.fit_transform(df[col].astype(str))
        label_encoders[col] = label_encoder

# 9. Interaction term: Priority * Time to resolve
# NOTE(review): 'priority' is now an alphabetical label code, not a severity scale,
# and this feature is derived from the column the modelling cell uses as its target.
if 'priority' in df.columns:
    df['priority_time_interaction'] = df['priority'] * df['time_to_resolve']
else:
    df['priority_time_interaction'] = 0

# Save cleaned and engineered dataset
cleaned_engineered_data_path = 'cleaned_engineered_dataset2.csv'
df.to_csv(cleaned_engineered_data_path, index=False)
print(f'Cleaned and engineered dataset saved at: {cleaned_engineered_data_path}')

# Show sample of engineered features
print('\nSample of engineered features:')
print(df[['time_to_resolve', 'time_to_assign', 'time_to_fix', 'description_word_count',
          'commenter_count', 'votes_watches_sum', 'priority_time_interaction']].head())


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 32 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   ï»¿issue_id        1000 non-null   int64  
 1   type               1000 non-null   object 
 2   status             1000 non-null   object 
 3   resolution         1000 non-null   object 
 4   component          877 non-null    object 
 5   priority           1000 non-null   object 
 6   reporter           1000 non-null   object 
 7   created            1000 non-null   object 
 8   assigned           977 non-null    object 
 9   assignee           1000 non-null   object 
 10  resolved           1000 non-null   object 
 11  created.1          1000 non-null   float64
 12  assigned.1         977 non-null    float64
 13  summary            1000 non-null   object 
 14  description        1000 non-null   object 
 15  affected_version   1000 non-null   object 
 16  fixed_versi

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, BatchNormalization, Flatten, Concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score

# Train and evaluate a two-branch (LSTM on text + dense on tabular) binary classifier
# on the engineered dataset, then save the model and print test metrics.

# Load dataset
dataset_path = 'cleaned_engineered_dataset2.csv'
df = pd.read_csv(dataset_path)

# Seed Python, NumPy and TensorFlow so initialisation and dropout repeat across runs
tf.keras.utils.set_random_seed(42)

# Keep the raw description text aside BEFORE the label-encoding loop below.
# Encoding it first would replace each description with an integer id, and the
# tokenizer would then see strings of digits instead of words.
has_description = 'description' in df.columns
if has_description:
    description_text = df['description'].fillna('').astype(str)

# Handling missing values (ensure no NaNs)
df = df.fillna(0)

# Convert categorical features to numerical using LabelEncoder
for col in df.select_dtypes(include=['object']).columns:
    if col == 'description':
        continue  # free text goes through the tokenizer, not the label encoder
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

# Define target variable
target = 'priority'
y = df[target].values

# Convert target to binary classification (e.g., median split)
# NOTE(review): 'priority' holds label codes, so this threshold splits on encoding
# order and can give a very lopsided class balance - check the share printed below.
median_time = np.median(y)
y_binary = (y >= median_time).astype(int)
print(f'Positive class share: {y_binary.mean():.3f}')

# Extract numerical and categorical features.
# 'priority_time_interaction' is priority * time_to_resolve, i.e. computed from the
# target, so it must not be offered to the model as an input.
excluded_cols = ['description', target, 'priority_time_interaction']
feature_cols = [col for col in df.columns if col not in excluded_cols]

# Split dataset by row index first, so the tokenizer and scaler below are fitted on
# training rows only. Same test_size/random_state as before.
row_idx = np.arange(len(df))
train_idx, test_idx, y_train, y_test = train_test_split(
    row_idx, y_binary, test_size=0.2, random_state=42
)

# Text processing for 'description' column (if available)
if has_description:
    tokenizer = Tokenizer(num_words=5000)
    tokenizer.fit_on_texts(description_text.iloc[train_idx])  # vocabulary from train only
    X_text = tokenizer.texts_to_sequences(description_text)
    X_text = pad_sequences(X_text, maxlen=100)
else:
    X_text = np.zeros((len(df), 100))
X_text_train, X_text_test = X_text[train_idx], X_text[test_idx]

# Normalize numerical features with statistics from the training rows only
X_tabular_raw = df[feature_cols].values
scaler = StandardScaler()
scaler.fit(X_tabular_raw[train_idx])
X_tabular = scaler.transform(X_tabular_raw)
X_tab_train, X_tab_test = X_tabular[train_idx], X_tabular[test_idx]

# Model architecture
text_input = Input(shape=(100,), name='text_input')
# The sequence length comes from the Input shape; Embedding's input_length argument is deprecated
text_embedding = Embedding(input_dim=5000, output_dim=128)(text_input)
text_lstm = LSTM(64, return_sequences=False)(text_embedding)
text_dense = Dense(32, activation='relu')(text_lstm)

# Tabular input
tabular_input = Input(shape=(X_tab_train.shape[1],), name='tabular_input')
tabular_dense = Dense(64, activation='relu')(tabular_input)
tabular_dense = BatchNormalization()(tabular_dense)
tabular_dense = Dropout(0.3)(tabular_dense)

# Combine both inputs
merged = Concatenate()([text_dense, tabular_dense])
final_dense = Dense(64, activation='relu')(merged)
final_dense = Dropout(0.3)(final_dense)
output = Dense(1, activation='sigmoid', name='output')(final_dense)

# Compile model
model = Model(inputs=[text_input, tabular_input], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train model. Validation uses a slice of the training data so the test set stays
# unseen until the final evaluation.
model.fit(
    [X_text_train, X_tab_train], y_train,
    validation_split=0.2,
    epochs=20, batch_size=32
)

# Save model under a name specific to dataset 2; the previous path
# ('rnn_mlp_model1.h5') overwrote the model trained on dataset 1.
model.save('/content/rnn_mlp_model2.h5')
print('Model saved!')

# Evaluate model
y_pred_probs = model.predict([X_text_test, X_tab_test]).ravel()  # (n, 1) -> (n,)
y_pred = (y_pred_probs >= 0.5).astype(int)

# MSE/MAE/R^2 compare predicted probabilities with the 0/1 labels; they are kept so
# the printed report has the same lines as before.
mse = mean_squared_error(y_test, y_pred_probs)
mae = mean_absolute_error(y_test, y_pred_probs)
r2 = r2_score(y_test, y_pred_probs)
accuracy = accuracy_score(y_test, y_pred)
# zero_division=0 keeps these defined when the model predicts a single class
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R^2 Score: {r2}')
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


Epoch 1/20




[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 27ms/step - accuracy: 0.4799 - loss: 0.9161 - val_accuracy: 0.9950 - val_loss: 0.0772
Epoch 2/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.9568 - loss: 0.1947 - val_accuracy: 0.9950 - val_loss: 0.0371
Epoch 3/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.9710 - loss: 0.1274 - val_accuracy: 0.9950 - val_loss: 0.0473
Epoch 4/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.9620 - loss: 0.1611 - val_accuracy: 0.9950 - val_loss: 0.0469
Epoch 5/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.9720 - loss: 0.1264 - val_accuracy: 0.9950 - val_loss: 0.0428
Epoch 6/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.9667 - loss: 0.1261 - val_accuracy: 0.9950 - val_loss: 0.0443
Epoch 7/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━



Model saved!
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
Mean Squared Error: 0.006395124364644289
Mean Absolute Error: 0.022321077063679695
R^2 Score: -0.28545236587524414
Accuracy: 0.995
Precision: 0.995
Recall: 1.0
F1 Score: 0.9974937343358395


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Load, clean and feature-engineer the raw issue export, then write it to
# 'cleaned_engineered_dataset3.csv' for the modelling cell.

# Load dataset
dataset_path = '/content/_derby.csv'
df = pd.read_csv(dataset_path, sep=',', encoding='ISO-8859-1', header=0)

# The file starts with a UTF-8 byte-order mark. Decoded as ISO-8859-1 it becomes the
# three characters U+00EF U+00BB U+00BF glued onto the first header, so strip them.
df.columns = [str(col).replace('\u00ef\u00bb\u00bf', '') for col in df.columns]

# Show dataset info and preview
print('Dataset Info:')
df.info()  # info() prints on its own and returns None; print(df.info()) adds a stray "None"
print('\nFirst 5 rows:')
print(df.head())

# Handle missing values: normalise the textual placeholders to real NaN
df = df.replace(['null', 'NULL', 'NaN', '', ' '], np.nan)

# Rows with no data at all would otherwise be imputed into a synthetic issue
df = df.dropna(how='all').reset_index(drop=True)

datetime_cols = ['created', 'resolved', 'assigned']
critical_labels = ['Surprising', 'Dormant', 'Blocker', 'Security', 'Performance', 'Breakage']

# Remember which label cells were actually populated BEFORE imputation; once the
# columns are filled below, notnull() is True everywhere and the flag is constant.
# NOTE(review): if these columns hold 0/1 indicators rather than "present/absent",
# the flags should be built from the values instead - confirm against the data.
label_present = {label: df[label].notna() for label in critical_labels if label in df.columns}

# Fill missing numeric columns with median
numeric_cols = df.select_dtypes(include=[np.number]).columns
df[numeric_cols] = df[numeric_cols].apply(lambda col: col.fillna(col.median()))

# Fill missing categorical columns with mode.
# Date columns are skipped: giving a missing 'assigned' the most frequent timestamp
# fabricates an assignment time and yields meaningless (even negative) durations.
categorical_cols = [col for col in df.select_dtypes(include=[object]).columns
                    if col not in datetime_cols]
for col in categorical_cols:
    modes = df[col].mode()
    if not modes.empty:  # an all-NaN column has no mode; leave it untouched
        df[col] = df[col].fillna(modes.iloc[0])

# Convert datetime columns (unparseable values become NaT)
for col in datetime_cols:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')

# Feature engineering

# 1. Time to resolve (hours)
df['time_to_resolve'] = (df['resolved'] - df['created']).dt.total_seconds() / 3600

# 2. Time to assign (hours); issues that were never assigned get 0
if 'assigned' in df.columns:
    df['time_to_assign'] = (df['assigned'] - df['created']).dt.total_seconds() / 3600
    df['time_to_assign'] = df['time_to_assign'].fillna(0)
else:
    # The summary print at the end selects this column, so it must always exist
    df['time_to_assign'] = 0

# 3. Time to fix (80% of time to resolve)
df['time_to_fix'] = df['time_to_resolve'] * 0.8

# 4. Description word count
if 'description' in df.columns:
    df['description_word_count'] = df['description'].astype(str).str.split().str.len().fillna(0)
else:
    df['description_word_count'] = 0

# 5. Number of commenters
if 'commenter' in df.columns:
    df['commenter_count'] = df['commenter'].astype(str).str.split(',').str.len().fillna(0)
else:
    df['commenter_count'] = 0

# 6. Sum of votes and watches (only over the columns that exist)
vote_cols = [col for col in ['votes', 'watches'] if col in df.columns]
for col in vote_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
df['votes_watches_sum'] = df[vote_cols].sum(axis=1) if vote_cols else 0

# 7. Binary flags for critical labels, taken from the pre-imputation mask
for label in critical_labels:
    if label in label_present:
        df[f'is_{label.lower()}'] = label_present[label].astype(int)
    else:
        df[f'is_{label.lower()}'] = 0

# 8. Encode categorical variables; one encoder per column so every mapping
#    stays available through label_encoders[col].classes_
label_encoders = {}
categorical_cols = ['type', 'status', 'resolution', 'component', 'priority', 'reporter', 'assignee']
for col in categorical_cols:
    if col in df.columns:
        label_encoder = LabelEncoder()
        df[col] = label_encoder.fit_transform(df[col].astype(str))
        label_encoders[col] = label_encoder

# 9. Interaction term: Priority * Time to resolve
# NOTE(review): 'priority' is now an alphabetical label code, not a severity scale,
# and this feature is derived from the column the modelling cell uses as its target.
if 'priority' in df.columns:
    df['priority_time_interaction'] = df['priority'] * df['time_to_resolve']
else:
    df['priority_time_interaction'] = 0

# Save cleaned and engineered dataset
cleaned_engineered_data_path = 'cleaned_engineered_dataset3.csv'
df.to_csv(cleaned_engineered_data_path, index=False)
print(f'Cleaned and engineered dataset saved at: {cleaned_engineered_data_path}')

# Show sample of engineered features
print('\nSample of engineered features:')
print(df[['time_to_resolve', 'time_to_assign', 'time_to_fix', 'description_word_count',
          'commenter_count', 'votes_watches_sum', 'priority_time_interaction']].head())


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 36 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ï»¿IssueId                1000 non-null   int64  
 1   type                      1000 non-null   object 
 2   status                    1000 non-null   object 
 3   resolution                1000 non-null   object 
 4   component                 989 non-null    object 
 5   priority                  1000 non-null   object 
 6   reporter                  1000 non-null   object 
 7   created                   1000 non-null   object 
 8   assigned                  972 non-null    object 
 9   assignee                  1000 non-null   object 
 10  resolved                  1000 non-null   object 
 11  fixingTimeFromCreate      1000 non-null   float64
 12  fixingTimeFromAssignment  972 non-null    object 
 13  summary                   1000 non-null   object 


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, BatchNormalization, Flatten, Concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score

# Train and evaluate a two-branch (LSTM on text + dense on tabular) binary classifier
# on the engineered dataset, then save the model and print test metrics.

# Load dataset
dataset_path = 'cleaned_engineered_dataset3.csv'
df = pd.read_csv(dataset_path)

# Seed Python, NumPy and TensorFlow so initialisation and dropout repeat across runs
tf.keras.utils.set_random_seed(42)

# Keep the raw description text aside BEFORE the label-encoding loop below.
# Encoding it first would replace each description with an integer id, and the
# tokenizer would then see strings of digits instead of words.
has_description = 'description' in df.columns
if has_description:
    description_text = df['description'].fillna('').astype(str)

# Handling missing values (ensure no NaNs)
df = df.fillna(0)

# Convert categorical features to numerical using LabelEncoder
for col in df.select_dtypes(include=['object']).columns:
    if col == 'description':
        continue  # free text goes through the tokenizer, not the label encoder
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

# Define target variable
target = 'priority'
y = df[target].values

# Convert target to binary classification (e.g., median split)
# NOTE(review): 'priority' holds label codes, so this threshold splits on encoding
# order and can give a very lopsided class balance - check the share printed below.
median_time = np.median(y)
y_binary = (y >= median_time).astype(int)
print(f'Positive class share: {y_binary.mean():.3f}')

# Extract numerical and categorical features.
# 'priority_time_interaction' is priority * time_to_resolve, i.e. computed from the
# target, so it must not be offered to the model as an input.
excluded_cols = ['description', target, 'priority_time_interaction']
feature_cols = [col for col in df.columns if col not in excluded_cols]

# Split dataset by row index first, so the tokenizer and scaler below are fitted on
# training rows only. Same test_size/random_state as before.
row_idx = np.arange(len(df))
train_idx, test_idx, y_train, y_test = train_test_split(
    row_idx, y_binary, test_size=0.2, random_state=42
)

# Text processing for 'description' column (if available)
if has_description:
    tokenizer = Tokenizer(num_words=5000)
    tokenizer.fit_on_texts(description_text.iloc[train_idx])  # vocabulary from train only
    X_text = tokenizer.texts_to_sequences(description_text)
    X_text = pad_sequences(X_text, maxlen=100)
else:
    X_text = np.zeros((len(df), 100))
X_text_train, X_text_test = X_text[train_idx], X_text[test_idx]

# Normalize numerical features with statistics from the training rows only
X_tabular_raw = df[feature_cols].values
scaler = StandardScaler()
scaler.fit(X_tabular_raw[train_idx])
X_tabular = scaler.transform(X_tabular_raw)
X_tab_train, X_tab_test = X_tabular[train_idx], X_tabular[test_idx]

# Model architecture
text_input = Input(shape=(100,), name='text_input')
# The sequence length comes from the Input shape; Embedding's input_length argument is deprecated
text_embedding = Embedding(input_dim=5000, output_dim=128)(text_input)
text_lstm = LSTM(64, return_sequences=False)(text_embedding)
text_dense = Dense(32, activation='relu')(text_lstm)

# Tabular input
tabular_input = Input(shape=(X_tab_train.shape[1],), name='tabular_input')
tabular_dense = Dense(64, activation='relu')(tabular_input)
tabular_dense = BatchNormalization()(tabular_dense)
tabular_dense = Dropout(0.3)(tabular_dense)

# Combine both inputs
merged = Concatenate()([text_dense, tabular_dense])
final_dense = Dense(64, activation='relu')(merged)
final_dense = Dropout(0.3)(final_dense)
output = Dense(1, activation='sigmoid', name='output')(final_dense)

# Compile model
model = Model(inputs=[text_input, tabular_input], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train model. Validation uses a slice of the training data so the test set stays
# unseen until the final evaluation.
model.fit(
    [X_text_train, X_tab_train], y_train,
    validation_split=0.2,
    epochs=20, batch_size=32
)

# Save model under a name specific to dataset 3; the previous path
# ('rnn_mlp_model1.h5') overwrote the model trained on dataset 1.
model.save('/content/rnn_mlp_model3.h5')
print('Model saved!')

# Evaluate model
y_pred_probs = model.predict([X_text_test, X_tab_test]).ravel()  # (n, 1) -> (n,)
y_pred = (y_pred_probs >= 0.5).astype(int)

# MSE/MAE/R^2 compare predicted probabilities with the 0/1 labels; they are kept so
# the printed report has the same lines as before.
mse = mean_squared_error(y_test, y_pred_probs)
mae = mean_absolute_error(y_test, y_pred_probs)
r2 = r2_score(y_test, y_pred_probs)
accuracy = accuracy_score(y_test, y_pred)
# zero_division=0 keeps these defined when the model predicts a single class
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)

print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
print(f'R^2 Score: {r2}')
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


Epoch 1/20




[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 26ms/step - accuracy: 0.7473 - loss: 0.5075 - val_accuracy: 0.9800 - val_loss: 0.0969
Epoch 2/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.9763 - loss: 0.1300 - val_accuracy: 0.9800 - val_loss: 0.1003
Epoch 3/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.9685 - loss: 0.1359 - val_accuracy: 0.9800 - val_loss: 0.0924
Epoch 4/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.9584 - loss: 0.1576 - val_accuracy: 0.9800 - val_loss: 0.0927
Epoch 5/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.9677 - loss: 0.1259 - val_accuracy: 0.9800 - val_loss: 0.0918
Epoch 6/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.9598 - loss: 0.1342 - val_accuracy: 0.9800 - val_loss: 0.0889
Epoch 7/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━



Model saved!




[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Mean Squared Error: 0.02087937854230404
Mean Absolute Error: 0.04594748839735985
R^2 Score: -0.06527435779571533
Accuracy: 0.975
Precision: 0.9798994974874372
Recall: 0.9948979591836735
F1 Score: 0.9873417721518988
