In [1]:
import pandas as pd
import numpy as np
from scipy import stats

# Load the call-quality records and preview the first few rows as plain text.
data = pd.read_csv('dataset.csv')
print(data.head())

# State / union-territory names accepted by this analysis. Rows whose
# 'state_name' is anything else (a missing name included) are discarded below.
# NOTE(review): Chandigarh, Ladakh and Dadra and Nagar Haveli and Daman and Diu
# are not in this set - confirm that dropping such rows is intended.
valid_states = {
    # states
    'Andhra Pradesh', 'Arunachal Pradesh', 'Assam', 'Bihar',
    'Chhattisgarh', 'Goa', 'Gujarat', 'Haryana',
    'Himachal Pradesh', 'Jharkhand', 'Karnataka', 'Kerala',
    'Madhya Pradesh', 'Maharashtra', 'Manipur', 'Meghalaya',
    'Mizoram', 'Nagaland', 'Odisha', 'Punjab',
    'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana',
    'Tripura', 'Uttar Pradesh', 'Uttarakhand', 'West Bengal',
    # territories
    'Jammu and Kashmir', 'Puducherry', 'Delhi', 'Lakshadweep',
    'Andaman and Nicobar Islands',
}

# Keep only the rows whose 'state_name' is one of the accepted names.
data = data.loc[data['state_name'].isin(valid_states)]
data.head(10)

  inout_travelling operator network_type  rating   calldrop_category  \
0           Indoor     RJio           4G       1  Poor Voice Quality   
1           Indoor     RJio           4G       1  Poor Voice Quality   
2           Indoor       VI           4G       1  Poor Voice Quality   
3           Indoor       VI           4G       1  Poor Voice Quality   
4       Travelling       VI           4G       4        Satisfactory   

    latitude  longitude     state_name  month  year  
0  26.687119  82.171533  Uttar Pradesh      7  2022  
1  26.687119  82.171533  Uttar Pradesh      7  2022  
2  -1.000000  -1.000000            NaN      7  2022  
3  -1.000000  -1.000000            NaN      7  2022  
4  21.123036  79.067904    Maharashtra      7  2022  


Unnamed: 0,inout_travelling,operator,network_type,rating,calldrop_category,latitude,longitude,state_name,month,year
0,Indoor,RJio,4G,1,Poor Voice Quality,26.687119,82.171533,Uttar Pradesh,7,2022
1,Indoor,RJio,4G,1,Poor Voice Quality,26.687119,82.171533,Uttar Pradesh,7,2022
4,Travelling,VI,4G,4,Satisfactory,21.123036,79.067904,Maharashtra,7,2022
8,Indoor,RJio,4G,3,Satisfactory,19.225397,72.859307,Maharashtra,7,2022
10,Outdoor,VI,4G,4,Satisfactory,25.525933,87.03288,Bihar,7,2022
12,Outdoor,VI,Unknown,5,Satisfactory,25.510683,86.937457,Bihar,7,2022
14,Indoor,RJio,4G,5,Satisfactory,15.87268,74.507458,Karnataka,7,2022
19,Indoor,Airtel,4G,5,Satisfactory,13.002983,77.711753,Karnataka,7,2022
20,Indoor,RJio,4G,5,Satisfactory,12.956595,77.489688,Karnataka,7,2022
21,Indoor,RJio,4G,5,Satisfactory,12.956594,77.489697,Karnataka,7,2022


In [2]:
# Summarise the filtered frame: index range, per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54222 entries, 0 to 104332
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   inout_travelling   54222 non-null  object 
 1   operator           54222 non-null  object 
 2   network_type       54222 non-null  object 
 3   rating             54222 non-null  int64  
 4   calldrop_category  54222 non-null  object 
 5   latitude           54222 non-null  float64
 6   longitude          54222 non-null  float64
 7   state_name         54222 non-null  object 
 8   month              54222 non-null  int64  
 9   year               54222 non-null  int64  
dtypes: float64(2), int64(3), object(5)
memory usage: 4.6+ MB


In [3]:
# Count the missing values left in each column after the state filter.
print(data.isna().sum())

inout_travelling     0
operator             0
network_type         0
rating               0
calldrop_category    0
latitude             0
longitude            0
state_name           0
month                0
year                 0
dtype: int64


In [4]:
# Build the cleaned frame: treat the 'Unknown' placeholder as missing and keep
# only the rows that are complete in every column.
#
# This is written as a single non-mutating chain on purpose. Calling
# fillna(..., inplace=True) on a column selection such as
# data_cleaned['state_name'] is chained assignment: pandas warns about it and,
# under copy-on-write, it does not modify the parent frame at all. Filling NaN
# with 'Unknown' only to turn 'Unknown' back into NaN is also unnecessary,
# because both kinds of row end up dropped either way.
data_cleaned = (
    data
    .replace('Unknown', np.nan)  # placeholder -> real missing value, in any column
    .dropna()                    # drop rows with a missing value in any column
    .copy()                      # independent frame, safe to modify in later cells
)
data_cleaned

Unnamed: 0,inout_travelling,operator,network_type,rating,calldrop_category,latitude,longitude,state_name,month,year
0,Indoor,RJio,4G,1,Poor Voice Quality,26.687119,82.171533,Uttar Pradesh,7,2022
1,Indoor,RJio,4G,1,Poor Voice Quality,26.687119,82.171533,Uttar Pradesh,7,2022
4,Travelling,VI,4G,4,Satisfactory,21.123036,79.067904,Maharashtra,7,2022
8,Indoor,RJio,4G,3,Satisfactory,19.225397,72.859307,Maharashtra,7,2022
10,Outdoor,VI,4G,4,Satisfactory,25.525933,87.032880,Bihar,7,2022
...,...,...,...,...,...,...,...,...,...,...
104326,Travelling,Airtel,4G,2,Call Dropped,16.481119,81.150113,Andhra Pradesh,11,2023
104327,Indoor,RJio,4G,5,Satisfactory,12.956510,77.489557,Karnataka,11,2023
104328,Outdoor,VI,4G,1,Poor Voice Quality,21.132335,79.068327,Maharashtra,11,2023
104329,Indoor,RJio,4G,5,Satisfactory,12.958151,77.484752,Karnataka,11,2023


In [5]:
# Show the per-column missing-value counts of the cleaned frame. This is printed
# explicitly because a notebook only auto-displays the last expression of a
# cell, so a bare expression placed here would produce no output.
print(data_cleaned.isnull().sum())

# Print the distinct values remaining in each column of interest.
for col in ['state_name', 'operator', 'inout_travelling', 'network_type',
            'year', 'month', 'rating']:
    print(set(data_cleaned[col]))


{'Punjab', 'Uttarakhand', 'Assam', 'Maharashtra', 'West Bengal', 'Karnataka', 'Gujarat', 'Manipur', 'Tripura', 'Bihar', 'Odisha', 'Goa', 'Mizoram', 'Jammu and Kashmir', 'Uttar Pradesh', 'Andhra Pradesh', 'Telangana', 'Delhi', 'Rajasthan', 'Haryana', 'Chhattisgarh', 'Jharkhand', 'Madhya Pradesh', 'Himachal Pradesh', 'Puducherry', 'Meghalaya', 'Tamil Nadu', 'Kerala', 'Arunachal Pradesh', 'Nagaland'}
{'Idea', 'VI', 'MTNL', 'Airtel', 'Vodafone', 'RJio', 'BSNL'}
{'Outdoor', 'Indoor', 'Travelling'}
{'2G', '3G', '4G'}
{2024, 2021, 2022, 2023}
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}
{1, 2, 3, 4, 5}


In [6]:
from sklearn.preprocessing import LabelEncoder

# String-valued columns that are converted to integer codes.
categorical_columns = ['inout_travelling', 'operator', 'network_type', 'state_name']

# One LabelEncoder per column, stored in label_encoders under the column name.
label_encoders = {col: LabelEncoder() for col in categorical_columns}
for col, le in label_encoders.items():
    # Fit on the column's current values and overwrite it with the integer codes.
    data_cleaned[col] = le.fit_transform(data_cleaned[col])

# Show the set of codes now present in each encoded column.
for col in categorical_columns:
    print(set(data_cleaned[col]))
data_cleaned.head()

{0, 1, 2}
{0, 1, 2, 3, 4, 5, 6}
{0, 1, 2}
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29}


Unnamed: 0,inout_travelling,operator,network_type,rating,calldrop_category,latitude,longitude,state_name,month,year
0,0,4,2,1,Poor Voice Quality,26.687119,82.171533,27,7,2022
1,0,4,2,1,Poor Voice Quality,26.687119,82.171533,27,7,2022
4,2,5,2,4,Satisfactory,21.123036,79.067904,15,7,2022
8,0,4,2,3,Satisfactory,19.225397,72.859307,15,7,2022
10,1,5,2,4,Satisfactory,25.525933,87.03288,3,7,2022


In [7]:
# Feature matrix: the four label-encoded categorical columns.
X = data_cleaned.loc[:, ['operator', 'network_type', 'state_name', 'inout_travelling']]
# Example target variable: the call-drop category of each record.
y = data_cleaned.loc[:, 'calldrop_category']

In [8]:
from sklearn.model_selection import train_test_split


# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [9]:
from sklearn.ensemble import RandomForestClassifier

# Create a 100-tree random forest with a fixed seed and train it on the training
# partition. fit() returns the estimator itself, so rf_model is the fitted model.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

print("Model training complete!")

Model training complete!


In [10]:
# Predict labels for both partitions: the held-out test rows first, then the
# training rows (used afterwards to compare training and test accuracy).
y_pred, y_train_pred = (rf_model.predict(features) for features in (X_test, X_train))

In [11]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fraction of correctly classified rows on each partition.
train_accuracy = accuracy_score(y_train, y_train_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy on training data: {train_accuracy * 100:.2f}%")

print(f"Accuracy test data: {accuracy * 100:.2f}%")

# Per-class precision, recall and F1 on the test partition.
print("Classification Report:\n", classification_report(y_test, y_pred))

# Test-partition confusion matrix: rows are true classes, columns are predicted classes.
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy on training data: 83.88%
Accuracy test data: 82.78%
Classification Report:
                     precision    recall  f1-score   support

      Call Dropped       0.85      0.56      0.68      1568
Poor Voice Quality       0.67      0.33      0.44      2085
      Satisfactory       0.84      0.97      0.90      9888

          accuracy                           0.83     13541
         macro avg       0.79      0.62      0.67     13541
      weighted avg       0.82      0.83      0.81     13541

Confusion Matrix:
 [[ 880  116  572]
 [ 126  689 1270]
 [  32  216 9640]]
