### Import libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from catboost import CatBoostClassifier

### Step 1: Load Dataset

In [2]:
data = pd.read_csv('train_data.csv')



### Display Top 5 Rows of The Dataset

In [3]:
data.head()

Unnamed: 0,Region,Area,Branch,Branch Code,Gender,Age,Age Level,Education,Marital Status,house_ownership,...,Expenses,Saving Amount,Monthly_Saving,who_will_earn,Social behavior,Loan Amount,Inst Months,Inst Amnt,Activity,Tax
0,Nowshera,Nowshera,Akora Khattak,14303,M,37,Middle Age (30-45),Matric,Married,Owned,...,30000,20000,At_home,Self,Good,300000,60,6123,Renovation / Up-gradation,Tax paid
1,Kasur,Pattoki,Manga Mandi,1721,F,40,Middle Age (30-45),Illiterate,Married,Rented,...,39000,11000,No_saving,Self,Good,500000,60,10204,New Construction,Tax not paid
2,Mianwali,Quidabad,Chak No 8,3409,M,37,Middle Age (30-45),Intermediate,Married,Owned,...,22500,9500,At_home,Self,Good,500000,60,10204,New Construction,Tax paid
3,Bahawalnagar,Chishtian,Dahranwala,212,M,40,Middle Age (30-45),Illiterate,Married,Rented,...,19500,3500,No_saving,Other,Good,350000,60,7143,New Construction,Tax paid
4,Multan,Rasheed Abad,Farooq Pura,2220,M,49,Older (46-Above),Matric,Married,Owned,...,30000,20000,At_home,Self,Good,500000,60,10204,New Construction,Tax paid


### Check Last 5 Rows of The Dataset

In [4]:
data.tail()

Unnamed: 0,Region,Area,Branch,Branch Code,Gender,Age,Age Level,Education,Marital Status,house_ownership,...,Expenses,Saving Amount,Monthly_Saving,who_will_earn,Social behavior,Loan Amount,Inst Months,Inst Amnt,Activity,Tax
33261,Bahawalnagar,Bahawalnagar,Macleod Ganj,210,F,43,Middle Age (30-45),Illiterate,Widower,Rented,...,29000,16000,At_home,Self,Good,500000,60,10204,New Construction,Tax paid
33262,Kasur,Theeng More,Landay,1823,F,31,Middle Age (30-45),Masters,Married,Owned,...,22000,20000,At_home,Self,Good,400000,60,8163,New Construction,Tax not paid
33263,Attock,Fateh Jang,Qutbal,2817,M,28,Young (19-29),Primary,Single,Rented,...,26000,24000,At_home,Self,Good,500000,60,10204,New Construction,Tax not paid
33264,FATA,Bajaur,Khar,8101,M,37,Middle Age (30-45),Middle,Married,Owned,...,23500,28000,Bank_saving,Self,Good,500000,60,10204,New Construction,Tax paid
33265,Sahiwal,Mian Channu,Chichawatni,2603,M,33,Middle Age (30-45),Intermediate,Married,Rented,...,26000,22000,At_home,Self,Very_Bad,800000,60,16327,New Construction,Tax paid


### Find Shape of Our Dataset (Number of Rows And Number of Columns)

In [5]:
data.shape

(33266, 24)

In [6]:
print("Number of Rows",data.shape[0])
print("Number of Columns",data.shape[1])

Number of Rows 33266
Number of Columns 24


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33266 entries, 0 to 33265
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Region                33266 non-null  object
 1   Area                  33266 non-null  object
 2   Branch                33266 non-null  object
 3   Branch Code           33266 non-null  int64 
 4   Gender                33266 non-null  object
 5   Age                   33266 non-null  int64 
 6   Age Level             33266 non-null  object
 7   Education             33266 non-null  object
 8   Marital Status        33266 non-null  object
 9   house_ownership       33266 non-null  object
 10  total_family_members  33266 non-null  int64 
 11  no_of_earning_hands   33266 non-null  int64 
 12  Source of Incom       33266 non-null  object
 13  Incom                 33266 non-null  object
 14  Expenses              33266 non-null  object
 15  Saving Amount         33266 non-null

### Check Null Values In The Dataset

In [8]:
data.isnull().sum()

Region                  0
Area                    0
Branch                  0
Branch Code             0
Gender                  0
Age                     0
Age Level               0
Education               0
Marital Status          0
house_ownership         0
total_family_members    0
no_of_earning_hands     0
Source of Incom         0
Incom                   0
Expenses                0
Saving Amount           0
Monthly_Saving          0
who_will_earn           0
Social behavior         0
Loan Amount             0
Inst Months             0
Inst Amnt               0
Activity                0
Tax                     0
dtype: int64

In [9]:
data['Region'].unique()

array(['Nowshera', 'Kasur', 'Mianwali', 'Bahawalnagar', 'Multan',
       'Sargodha', 'Chakwal', 'AJ&K', 'Bahawalpur', 'Abbottabad',
       'Sukkur', 'Karachi', 'DG Khan', 'Quetta', 'Gujranwala', 'FATA',
       'Kohat', 'Faisalabad', 'Jhang', 'Sheikhupura', 'Attock',
       'Rawalpindi', 'Jhelum', 'Lahore', 'Gilgit Baltistan', 'Sahiwal'],
      dtype=object)

In [10]:
data['Area'].unique()

array(['Nowshera', 'Pattoki', 'Quidabad', 'Chishtian', 'Rasheed Abad',
       'Mianwali', 'Jahanian', 'Sargodha', 'Talagang', 'Abbaspur',
       'Lodhran(Bahawalpur2)', 'Abbottabad', 'Kot Momin', 'Hyderabad',
       'Korangi', 'Muzaffarabad', 'Alipur', 'Quetta', 'Multan',
       'D.G Khan', 'Rawalakot', 'Theeng More', 'Sialkot', 'Mohmand',
       'Bhakkar', 'Jhelum Valley', 'Kohat', 'Mansehra', 'Swabi',
       'Faisalabad', 'Rajana', 'Battagram', 'Chitral', 'Jhang', 'Karachi',
       'Khuzdar', 'Pichnand', 'Kamoke', 'Toba Tek Singh', 'Bahawalnagar',
       'Narowal', 'Jand', 'Rahim yar Khan', 'Hafizabad', 'Bahawalpur',
       'Tench Bhata', 'Bhera', 'Sukkur', 'Mandi Bahauddin', 'Gareebabad',
       'Thokar', 'Kasur', 'Ghizer', 'Sangla Hill', 'Okara', 'Gujrat',
       'Kala Bagh', 'Rajanpur', 'Kotli', 'Samundari', 'Peshawar',
       'Bajaur', 'Pind Dadan Khan', 'Rangeel Pur', 'Thatha', 'Nishtar',
       'Salamatpura', 'Vehari', 'Chiniot', 'Jacob Abad', 'Mian Channu',
       'Gujranwala'

In [11]:
data['Branch'].unique()

array(['Akora Khattak', 'Manga Mandi', 'Chak No 8', 'Dahranwala',
       'Farooq Pura', 'Mianwali-1', 'Dunyapur', 'Sargodha-2',
       'Jhatla / Chinji', 'Haroonabad-1', 'Baloch', 'Lodhran', 'Havelian',
       'Moazzam Abad', 'Digri', 'Shireen Jinnah', 'Muzaffarabad-2',
       'Shaher Sultan', 'Nawan Killi', 'Shuja Abad', 'D.G Khan-1',
       'Khai Gala', 'Rati Pindi', 'Daska', 'Yaka Ghund', 'Mankera',
       'Dhirkot', 'Kalaya', 'Mansehra-1', 'Chunian', 'Shewa',
       'Canal Road Faisalabad', 'Hajji Chand', 'Pir Mahal', 'Battagram',
       'Drosh', 'Athara Hazari', 'Baldia Town', 'Mastung', 'Wanhar',
       'Harnoli', 'Muridke', 'Toba Tek Singh', 'Dunga Bunga', 'Pasrur-1',
       'Kot Momin', 'Ikhlas', 'Dhudhi Wala', 'Changa Manga',
       'Sadiq Abad-1', 'Kot Chajji', 'Hafizabad', 'Qadirpur Ran',
       'Yazman Mandi', 'Bahawalnagar-3', 'Garhi Dupatta', 'Tench Bhata',
       'Miani', 'Sukkur-1', 'Gojra-Mandi', 'Sardar Market', 'Dhamthal',
       'Sher Shah Colony', 'Bahawalnagar-1',

In [12]:
data['Branch Code'].unique()

array([14303,  1721,  3409,   212,  2220,  3501,  2402,  3204,  2914,
         204, 50504,  2401, 12404,  3213, 34202, 35106, 50202,   611,
        5102,  2206,   401, 50605,  1827,  1601,  8202,  3304, 50402,
        8501, 12201,  1801, 13204,   840,  6414,  1004, 12301, 11102,
         905, 35111, 47501,  2931,  3520,  1103,  1002,   206,  1603,
        3209,  2829,   824,  1803,   302,  2832,  1301,  2208,   106,
         213, 50203,  3102,  3201,  5001,  1410,  3140,  1504,  1744,
         202,   814, 32301,  1820,  2404, 51201,  2107,  2505,  1203,
        3519,  2930,   702, 50901,  1806, 35102, 45301, 50501,  1819,
        3308,   812,  1709, 50205, 14203,  8105,  3015,  2207,  3118,
       33802,  3405,  1115,   103,  1825,  1725, 50201,  3306,  1736,
        2814,  3206,   208,  3516,  1828,  1821,  1826,  2701,  3010,
        1608,  1302,  1719, 31101, 11201, 14311,  2306,  2217,  1818,
        1405,  1101, 14306, 33801, 14101,  8401,   110, 16102, 35104,
        2209,  2803,

In [13]:
data['Education'].unique()

array(['Matric', 'Illiterate', 'Intermediate', 'Primary', 'Bachelor',
       'Masters', 'Middle', 'Mphil', 'Phd'], dtype=object)

In [14]:
data['Age'].unique()

array([37, 40, 49, 47, 45, 42, 52, 44, 33, 35, 20, 39, 46, 28, 50, 53, 48,
       34, 24, 51, 36, 54, 56, 23, 31, 59, 55, 27, 29, 41, 38, 57, 21, 32,
       26, 25, 43, 58, 30, 22, 60, 19, 61])

In [15]:
data['Gender'].unique()

array(['M', 'F'], dtype=object)

In [16]:
data['Inst Months'].unique()

array([60, 48, 36, 30, 54, 42, 24, 12, 21])

In [17]:
data['Loan Amount'].unique()

array(['300,000', '500,000', '350,000', '750,000', '800,000', '700,000',
       '1,000,000', '375,000', '400,000', '600,000', '450,000', '900,000',
       '460,000', '475,000', '420,000', '470,000', '330,000', '250,000',
       '640,000', '480,000', '490,000', '430,000', '485,000', '200,000',
       '440,000', '650,000', '280,000', '550,000', '395,000', '150,000',
       '380,000', '320,000', '670,000', '425,000', '438,000', '685,000',
       '410,000', '340,000', '396,000', '283,500', '240,000', '100,000',
       '850,000', '680,000', '225,000', '175,000', '245,000', '476,636',
       '390,000', '125,000', '710,000', '366,000', '285,000', '476,396',
       '476,000', '275,000', '415,000', '205,000', '494,000', '434,500',
       '310,000', '220,000', '435,500', '230,000', '465,000', '370,000',
       '298,000', '675,000', '360,000', '434,000', '120,000', '630,000',
       '338,000', '421,500', '187,500', '875,000', '491,244', '448,000',
       '385,000', '325,000', '422,000', '498,690'

In [18]:
data['Marital Status'].unique()

array(['Married', 'Single', 'Widow', 'Divorced', 'Widower'], dtype=object)

In [19]:
data['Saving Amount'].unique()

array(['20,000', '11,000', '9,500', '3,500', '15,000', '13,500', '45,000',
       '12,000', '14,000', '5,000', '10,000', '29,000', '22,000',
       '27,000', '16,500', '43,500', '16,000', '4,000', '25,000',
       '21,800', '7,000', '32,000', '11,500', '17,000', '14,500',
       '28,500', '21,500', '0', '21,000', '25,500', '23,000', '18,000',
       '35,000', '29,500', '13,000', '13,900', '30,000', '9,000',
       '28,000', '24,700', '24,000', '18,500', '27,500', '1,000', '1,500',
       '19,000', '8,500', '31,500', '15,500', '23,500', '6,000', '20,500',
       '8,000', '26,500', '6,500', '19,500', '41,000', '11,800', '44,000',
       '24,500', '31,000', '3,000', '40,000', '22,500', '4,500', '38,500',
       '40,500', '26,000', '33,000', '7,500', '37,000', '12,500',
       '17,500', '22,951', '43,800', '34,000', '5,500', '42,000',
       '56,500', '36,000', '48,500', '43,752', '2,000', '35,500',
       '15,400', '10,500', '39,000', '38,000', '46,000', '41,500',
       '24,800', '2,500'

In [20]:
data['Expenses'].unique()

array(['30,000', '39,000', '22,500', '19,500', '40,000', '46,500',
       '15,000', '50,000', '34,000', '56,000', '45,000', '25,000',
       '49,000', '31,000', '23,000', '58,000', '28,000', '28,500',
       '41,500', '36,000', '46,000', '35,000', '26,000', '14,200',
       '53,000', '24,000', '43,000', '33,000', '25,500', '48,000',
       '18,000', '16,500', '80,000', '29,000', '24,500', '29,500',
       '60,000', '38,000', '27,000', '20,000', '67,000', '16,000',
       '32,000', '37,100', '20,500', '39,500', '19,000', '16,800',
       '22,000', '30,300', '75,000', '18,500', '32,500', '48,500',
       '68,000', '31,500', '26,500', '30,500', '64,000', '37,000',
       '27,500', '35,500', '65,000', '21,500', '47,000', '15,500',
       '85,000', '78,000', '54,000', '21,000', '34,500', '70,000',
       '42,000', '47,500', '33,700', '51,500', '36,500', '63,000',
       '11,500', '52,000', '17,000', '33,500', '44,000', '9,000',
       '44,500', '100,000', '8,000', '63,850', '90,000', '41,00

In [21]:
data['Activity'].unique()

array(['Renovation / Up-gradation', 'New Construction', 'Addition'],
      dtype=object)

In [22]:
data.sample(5)

Unnamed: 0,Region,Area,Branch,Branch Code,Gender,Age,Age Level,Education,Marital Status,house_ownership,...,Expenses,Saving Amount,Monthly_Saving,who_will_earn,Social behavior,Loan Amount,Inst Months,Inst Amnt,Activity,Tax
764,Kasur,Theeng More,Hajji Chand,6414,M,52,Older (46-Above),Illiterate,Married,Owned,...,20000,20000,At_home,Self,Good,450000,60,9184,New Construction,Tax not paid
22383,Sheikhupura,Kamoke,Kamoke-2,1110,M,21,Young (19-29),Primary,Single,Owned,...,35000,0,No_saving,Self,Good,800000,60,16327,New Construction,Tax not paid
20061,FATA,Peshawar,Faqeerabad,14203,F,38,Middle Age (30-45),Matric,Married,Rented,...,31500,18500,At_home,Self,Very_Good,500000,60,10204,New Construction,Tax paid
18450,Quetta,Khuzdar,Mastung,47501,M,38,Middle Age (30-45),Intermediate,Married,Rented,...,15000,28000,At_home,Self,Good,1000000,60,20408,New Construction,Tax not paid
27526,Bahawalnagar,Chishtian,Chishtian-2,208,M,55,Older (46-Above),Matric,Married,Owned,...,36000,9000,At_home,Self,Very_Good,400000,60,8163,Renovation / Up-gradation,Tax paid


data['Gender'] = data['Gender'].map({'Male':1,'Female':0}).astype('int')
data['Married'] = data['Married'].map({'Yes':1,'No':0}).astype('int')
data['Education'] = data['Education'].map({'Graduate':1,'Not Graduate':0}).astype('int')
data['Self_Employed'] = data['Self_Employed'].map({'Yes':1,'No':0}).astype('int')
data['Property_Area'] = data['Property_Area'].map({'Rural':0,'Semiurban':2,'Urban':1}).astype('int')
data['Loan_Status'] = data['Loan_Status'].map({'Y':1,'N':0}).astype('int')


### Testing data set

In [23]:
import pandas as pd
from catboost import CatBoostClassifier

In [24]:
# Load the test CSV
test_data = pd.read_csv('test_data.csv')

In [25]:
# Show the first few rows
test_data.head()


Unnamed: 0,Region,Area,Branch,Branch Code,Gender,Age,Age Level,Education,Marital Status,house_ownership,...,Expenses,Saving Amount,Monthly_Saving,who_will_earn,Social behavior,Loan Amount,Inst Months,Inst Amnt,Activity,Tax
0,Nowshera,Swabi,Chota Lahore,13202,M,57,Older (46-Above),Matric,Married,Owned,...,39000,11000,At_home,Other,Good,500000,60,10204,New Construction,Tax not paid
1,Abbottabad,Battagram,Shinkiari,12208,M,44,Middle Age (30-45),Middle,Married,Owned,...,36000,24000,At_home,Self,Good,800000,60,16327,New Construction,Tax paid
2,Mianwali,Bhakkar,Panjgrain,3308,M,53,Older (46-Above),Primary,Married,Owned,...,22000,38000,At_home,Self,Good,250000,60,5102,New Construction,Tax paid
3,Gilgit Baltistan,Gilgit,Gilgit,51101,M,46,Older (46-Above),Primary,Married,Rented,...,27500,22500,Bank_saving,Self,Good,500000,60,10204,New Construction,Tax paid
4,Karachi,Korangi,Saudabad Malir,35108,M,40,Middle Age (30-45),Middle,Married,Owned,...,30000,20000,At_home,Self,Good,500000,60,10204,New Construction,Tax paid


### Handling The missing Values

In [26]:
test_data.isnull().sum()

Region                  0
Area                    0
Branch                  0
Branch Code             0
Gender                  0
Age                     0
Age Level               0
Education               0
Marital Status          0
house_ownership         0
total_family_members    0
no_of_earning_hands     0
Source of Incom         0
Incom                   0
Expenses                0
Saving Amount           0
Monthly_Saving          0
who_will_earn           0
Social behavior         0
Loan Amount             0
Inst Months             0
Inst Amnt               0
Activity                0
Tax                     0
dtype: int64

In [27]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14257 entries, 0 to 14256
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Region                14257 non-null  object
 1   Area                  14257 non-null  object
 2   Branch                14257 non-null  object
 3   Branch Code           14257 non-null  int64 
 4   Gender                14257 non-null  object
 5   Age                   14257 non-null  int64 
 6   Age Level             14257 non-null  object
 7   Education             14257 non-null  object
 8   Marital Status        14257 non-null  object
 9   house_ownership       14257 non-null  object
 10  total_family_members  14257 non-null  int64 
 11  no_of_earning_hands   14257 non-null  int64 
 12  Source of Incom       14257 non-null  object
 13  Incom                 14257 non-null  object
 14  Expenses              14257 non-null  object
 15  Saving Amount         14257 non-null


### Store Feature Matrix In X And Response (Target) In Vector y

In [28]:
data['Tax '].unique()

array(['Tax paid', 'Tax not paid'], dtype=object)

In [29]:
print(test_data.columns)

Index(['Region', 'Area', 'Branch', 'Branch Code', 'Gender', 'Age', 'Age Level',
       'Education', 'Marital Status', 'house_ownership',
       'total_family_members', 'no_of_earning_hands', 'Source of Incom',
       'Incom', 'Expenses', 'Saving Amount', 'Monthly_Saving', 'who_will_earn',
       'Social behavior', 'Loan Amount', 'Inst Months', 'Inst Amnt',
       'Activity', 'Tax '],
      dtype='object')


In [30]:
X = data.drop('Tax ',axis=1)
print(X)

             Region          Area         Branch  Branch Code Gender  Age  \
0          Nowshera      Nowshera  Akora Khattak        14303      M   37   
1             Kasur       Pattoki    Manga Mandi         1721      F   40   
2          Mianwali      Quidabad      Chak No 8         3409      M   37   
3      Bahawalnagar     Chishtian     Dahranwala          212      M   40   
4            Multan  Rasheed Abad    Farooq Pura         2220      M   49   
...             ...           ...            ...          ...    ...  ...   
33261  Bahawalnagar  Bahawalnagar   Macleod Ganj          210      F   43   
33262         Kasur   Theeng More         Landay         1823      F   31   
33263        Attock    Fateh Jang         Qutbal         2817      M   28   
33264          FATA        Bajaur           Khar         8101      M   37   
33265       Sahiwal   Mian Channu    Chichawatni         2603      M   33   

                Age Level     Education Marital Status house_ownership  ...

### Define Target and Features


In [31]:
target = 'Tax '
X = data.drop(columns=[target])
y = data[target]

In [32]:
print(X)

             Region          Area         Branch  Branch Code Gender  Age  \
0          Nowshera      Nowshera  Akora Khattak        14303      M   37   
1             Kasur       Pattoki    Manga Mandi         1721      F   40   
2          Mianwali      Quidabad      Chak No 8         3409      M   37   
3      Bahawalnagar     Chishtian     Dahranwala          212      M   40   
4            Multan  Rasheed Abad    Farooq Pura         2220      M   49   
...             ...           ...            ...          ...    ...  ...   
33261  Bahawalnagar  Bahawalnagar   Macleod Ganj          210      F   43   
33262         Kasur   Theeng More         Landay         1823      F   31   
33263        Attock    Fateh Jang         Qutbal         2817      M   28   
33264          FATA        Bajaur           Khar         8101      M   37   
33265       Sahiwal   Mian Channu    Chichawatni         2603      M   33   

                Age Level     Education Marital Status house_ownership  ...

In [33]:
print(y)

0            Tax paid
1        Tax not paid
2            Tax paid
3            Tax paid
4            Tax paid
             ...     
33261        Tax paid
33262    Tax not paid
33263    Tax not paid
33264        Tax paid
33265        Tax paid
Name: Tax , Length: 33266, dtype: object


### Identify Categorical Columns


In [34]:
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

In [35]:
print(f"Categorical Columns: {categorical_cols}")


Categorical Columns: ['Region', 'Area', 'Branch', 'Gender', 'Age Level', 'Education', 'Marital Status', 'house_ownership', 'Source of Incom', 'Incom', 'Expenses', 'Saving Amount', 'Monthly_Saving', 'who_will_earn', 'Social behavior', 'Loan Amount', 'Inst Amnt', 'Activity']


### Train-Test Split


In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### Initialize CatBoost for developing ML Model


In [37]:
model = CatBoostClassifier(
    iterations=500,
    depth=8,
    learning_rate=0.1,
    cat_features=categorical_cols,
    \
    verbose=100,
     eval_metric='F1',
    class_weights=[5, 1],
)



### Train the Model


In [38]:
model.fit(X_train, y_train)

0:	learn: 0.8206412	total: 189ms	remaining: 1m 34s
100:	learn: 0.9988109	total: 13.3s	remaining: 52.7s
200:	learn: 0.9990424	total: 25.3s	remaining: 37.7s
300:	learn: 0.9991746	total: 37s	remaining: 24.5s
400:	learn: 0.9991746	total: 49.3s	remaining: 12.2s
499:	learn: 0.9991746	total: 1m 1s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1f4caaf3b60>

### Prediction the model


In [39]:
y_pred = model.predict(X_test)
print(y_pred)


['Tax not paid' 'Tax not paid' 'Tax paid' ... 'Tax not paid' 'Tax paid'
 'Tax not paid']


### Evaluation/ Classification Report & Confusion Matrix


In [40]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

Tax not paid       1.00      1.00      1.00      2872
    Tax paid       1.00      1.00      1.00      3782

    accuracy                           1.00      6654
   macro avg       1.00      1.00      1.00      6654
weighted avg       1.00      1.00      1.00      6654


Confusion Matrix:
[[2872    0]
 [   8 3774]]


### After training the model


#### Save the model


In [41]:
model.save_model("PMLP_catboost_model7.1.1.cbm")


In [42]:
# Get unique region names and sort them
region_names = sorted(data['Region'].unique())

# Print each region with a serial number
for i, name in enumerate(region_names, start=1):
    print(f"{i}. {name}")



1. AJ&K
2. Abbottabad
3. Attock
4. Bahawalnagar
5. Bahawalpur
6. Chakwal
7. DG Khan
8. FATA
9. Faisalabad
10. Gilgit Baltistan
11. Gujranwala
12. Jhang
13. Jhelum
14. Karachi
15. Kasur
16. Kohat
17. Lahore
18. Mianwali
19. Multan
20. Nowshera
21. Quetta
22. Rawalpindi
23. Sahiwal
24. Sargodha
25. Sheikhupura
26. Sukkur


In [43]:
# Get unique region names and sort them
region_names = sorted(data['Area'].unique())

# Print each region with a serial number
for i, name in enumerate(region_names, start=1):
    print(f"{i}. {name}")


1. Abbaspur
2. Abbottabad
3. Alipur
4. Attock
5. Bahawalnagar
6. Bahawalpur
7. Bajaur
8. Bannu
9. Battagram
10. Bhakkar
11. Bhera
12. Burewala
13. Chakwal
14. Chiniot
15. Chishtian
16. Chitral
17. D.G Khan
18. D.I. Khan
19. Danyor
20. Data sahib
21. Diamer
22. Faisalabad
23. Fateh Jang
24. Gareebabad
25. Ghanche
26. Ghizer
27. Gilgit
28. Gujranwala
29. Gujrat
30. Gwadar
31. Hafizabad
32. Hunza Nagar
33. Hyderabad
34. Islamabad
35. Jacob Abad
36. Jahanian
37. Jand
38. Jhang
39. Jhelum
40. Jhelum Valley
41. Kala Bagh
42. Kallar Kahar
43. Kamoke
44. Karachi
45. Kasur
46. Khanpur
47. Khushab
48. Khuzdar
49. Khyber
50. Kohat
51. Korangi
52. Kot Abdul Malik
53. Kot Momin
54. Kotli
55. Kurrum
56. Lasbela
57. Layyah
58. Lodhran(Bahawalpur2)
59. Loralai
60. Madhulal
61. Mandi Bahauddin
62. Mansehra
63. Mardan
64. Mian Channu
65. Mianwali
66. Mingora
67. Mirpur
68. Mohmand
69. Multan
70. Muridke
71. Muzaffarabad
72. Muzaffargarh
73. Narowal
74. Nishtar
75. Nowshera
76. Nushki
77. Okara
78. Patto

In [44]:
# Get unique region names and sort them
region_names = sorted(data['Branch'].unique())

# Print each region with a serial number
for i, name in enumerate(region_names, start=1):
    print(f"{i}. {name}")


1. 49 Tail(Agri)
2. 5 Chak Bara
3. 79 Adda (Agri)
4. 88 Phatak
5. Abbaspur
6. Abbottabad
7. Abbottabad-2
8. Abdul Hakeem
9. Adda Dad Fatiana
10. Adhi kot
11. Adhwal
12. Adyala Road
13. Ahmad Pur Lamma
14. Ahmadpur Sial
15. Ahmedpur Sharqia-1
16. Air Port
17. Aka Khel
18. Akbarpura
19. Akhtar Colony
20. Akora Khattak
21. Alam Chowk
22. Ali Abad
23. Ali Pur Chatha
24. Ali Pur-1
25. Ali Pur-2
26. Ali Town
27. Allai
28. Alpuri
29. Amar Pura
30. Arifwala
31. Arya Mohallah
32. Arzani Pur
33. Astor
34. Athara Hazari
35. Athmuqam
36. Attock Agri
37. Attock-1
38. Attock-2
39. Attock-4
40. Attock-5
41. Ayun
42. Babar Mela
43. Babu Wala
44. Badami Bagh
45. Badar Colony
46. Badar Colony-2
47. Badiana
48. Badin
49. Baffa
50. Bagan
51. Bagh
52. Baghbanpura
53. Bagrian
54. Bahawalnagar-1
55. Bahawalnagar-2
56. Bahawalnagar-3
57. Bahawalpur Agriculture
58. Bahawalpur Bypass
59. Bahawalpur-1
60. Bahawalpur-2
61. Bahawalpur-3
62. Baka Khel
63. Bakhshu Pull
64. Balakot
65. Baldia Town
66. Balkasar
67. Ba

In [45]:
# Get unique region names and sort them
region_names = sorted(data['Inst Amnt'].unique())

# Print each region with a serial number
for i, name in enumerate(region_names, start=1):
    print(f"{i}. {name}")


1. 10,000
2. 10,017
3. 10,025
4. 10,082
5. 10,177
6. 10,194
7. 10,204
8. 10,322
9. 10,401
10. 10,813
11. 11,032
12. 11,059
13. 11,123
14. 11,130
15. 11,225
16. 11,352
17. 11,796
18. 11,977
19. 12,245
20. 12,288
21. 12,608
22. 12,857
23. 12,976
24. 13,061
25. 13,265
26. 13,623
27. 13,674
28. 13,776
29. 13,878
30. 13,903
31. 13,980
32. 14,184
33. 14,286
34. 14,490
35. 14,745
36. 14,830
37. 15,306
38. 15,760
39. 16,327
40. 16,531
41. 17,347
42. 17,857
43. 18,167
44. 18,368
45. 18,538
46. 18,912
47. 19,286
48. 19,660
49. 2,449
50. 2,551
51. 20,408
52. 21,769
53. 22,704
54. 3,061
55. 3,152
56. 3,571
57. 3,686
58. 4,082
59. 4,133
60. 4,184
61. 4,490
62. 4,541
63. 4,592
64. 4,694
65. 4,728
66. 4,857
67. 4,898
68. 4,915
69. 5,000
70. 5,102
71. 5,510
72. 5,561
73. 5,612
74. 5,676
75. 5,714
76. 5,786
77. 5,816
78. 6,020
79. 6,082
80. 6,123
81. 6,144
82. 6,304
83. 6,531
84. 6,678
85. 6,735
86. 6,811
87. 6,888
88. 6,898
89. 6,939
90. 7,143
91. 7,347
92. 7,373
93. 7,415
94. 7,551
95. 7,653
96. 7,75

In [46]:
from sklearn.metrics import confusion_matrix

# Get predicted values
y_pred = model.predict(X_test)

# Create confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Get misclassified rows
wrong_preds = X_test[(y_test == 1) & (y_pred == 0)]  # Tax Paid predicted as Not Paid
print(wrong_preds)


Empty DataFrame
Columns: [Region, Area, Branch, Branch Code, Gender, Age, Age Level, Education, Marital Status, house_ownership, total_family_members, no_of_earning_hands, Source of Incom, Incom, Expenses, Saving Amount, Monthly_Saving, who_will_earn, Social behavior, Loan Amount, Inst Months, Inst Amnt, Activity]
Index: []

[0 rows x 23 columns]


In [47]:
print(cm)

[[2872    0]
 [   8 3774]]


In [48]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Encode target to 0/1
data = data.copy()
le = LabelEncoder()
data['Tax_encoded'] = le.fit_transform(data['Tax '])  # 'Tax paid' = 1, 'Tax not paid' = 0

# Encode categorical columns
data_encoded = data.drop(columns=['Tax '])
data_encoded = pd.get_dummies(data_encoded, drop_first=True)

# Correlation with the target
correlations = data_encoded.corrwith(data['Tax_encoded']).abs().sort_values(ascending=False)
print(correlations.head(10))  # Top 10 most correlated features



Tax_encoded            1.000000
Area_Theeng More       0.385546
Region_Kasur           0.383746
Region_Quetta          0.211990
Branch Code            0.187468
Area_Quetta            0.177121
Area_Jhelum Valley     0.161462
Branch_Hajji Chand     0.151454
who_will_earn_Other    0.150127
Region_Sahiwal         0.148568
dtype: float64
