In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Location of the survey workbook; kept in one named constant so it is easy to repoint.
DATA_PATH = "C:\\Users\\Admin\\Desktop\\DATN\\data.xlsx"

# Read the workbook into a DataFrame and show it as the cell output.
df = pd.read_excel(DATA_PATH)
df


Unnamed: 0,ResponseId,MainBranch,Age,Country,EdLevel,WorkExp,CompTotal,RemoteWork,Employment,OrgSize,...,OfficeStackSyncHaveWorkedWith,SOComm,AISelect,AISent,AIBen,AIAcc,AIToolCurrently Using,TBranch,ICorPM,BranchGroup
0,393,I am a developer by profession,35-44,United Kingdom Of Great Britain And Northern I...,Professional,18,126420.0,Hybrid,Employ,Large,...,Slack,No,Yes,Unfavorable,Improve accuracy in coding,Somewhat distrust,Writing code,Yes,People manager,Developer
1,417,I am a developer by profession,35-44,Brazil,Master,17,170000.0,Remote,Employ-Freelance,Medium,...,Google Meet-Telegram-Whatsapp-Zoom,No,Yes,Favorable,Increase productivity,Somewhat distrust,Writing code-Documenting code-Testing code,Yes,Individual contributor,Developer
2,424,I am a developer by profession,25-34,Italy,No Degree,4,62000.0,Remote,Employ-Freelance,Medium,...,Google Meet-Mattermost-Slack,Yes,Yes,Unfavorable,Increase productivity,Highly distrust,Writing code,Yes,Individual contributor,Developer
3,444,"I am not primarily a developer, but I write co...",35-44,United Kingdom Of Great Britain And Northern I...,Bachelor,20,127000.0,Remote,Employ,Enterprise,...,Google Meet-Microsoft Teams-Skype-Slack-Telegr...,Somewhat,Yes,Favorable,Increase productivity-Greater efficiency-Speed...,Somewhat trust,Writing code-Search for answers-Generating con...,Yes,People manager,Semi-technical
4,445,I am a developer by profession,25-34,United Kingdom Of Great Britain And Northern I...,Bachelor,10,115000.0,Remote,Employ-Student,Small,...,Slack-Zoom,Neutral,Yes,Favorable,Speed up learning,Neither trust nor distrust,Learning about a codebase-Writing code,Yes,Individual contributor,Developer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10100,65106,I am a developer by profession,25-34,United States Of America,Bachelor,8,155000.0,Hybrid,Employ,Enterprise,...,Symphony-Zoom,Somewhat,Yes,Favorable,Increase productivity-Greater efficiency,Somewhat trust,Writing code-Debugging and getting help-Testin...,Yes,Individual contributor,Developer
10101,65152,I am a developer by profession,35-44,Estonia,Master,14,90000.0,Hybrid,Employ,Large,...,Slack-Zoom,Neutral,Yes,Favorable,Increase productivity-Greater efficiency,Somewhat distrust,Writing code-Generating content or synthetic data,Yes,Individual contributor,Developer
10102,65164,I am a developer by profession,18-24,Germany,Associate,3,46000.0,Hybrid,Employ,Small,...,Discord-Microsoft Teams-Telegram-Whatsapp,Neutral,Yes,Favorable,Increase productivity-Greater efficiency-Impro...,Somewhat trust,Writing code-Documenting code-Debugging and ge...,Yes,Individual contributor,Developer
10103,65167,I am a developer by profession,25-34,Spain,Bachelor,8,81600.0,Remote,Employ,Large,...,Google Chat-Google Meet-Microsoft Teams-Skype-...,No,Yes,Very favorable,Increase productivity-Greater efficiency-Speed...,Somewhat trust,Learning about a codebase-Debugging and gettin...,Yes,Individual contributor,Developer


In [10]:
# Print the frame's column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10105 entries, 0 to 10104
Data columns (total 34 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   ResponseId                      10105 non-null  int64  
 1   MainBranch                      10105 non-null  object 
 2   Age                             10105 non-null  object 
 3   Country                         10105 non-null  object 
 4   EdLevel                         10105 non-null  object 
 5   WorkExp                         10105 non-null  int64  
 6   CompTotal                       10105 non-null  float64
 7   RemoteWork                      10105 non-null  object 
 8   Employment                      10105 non-null  object 
 9   OrgSize                         10105 non-null  object 
 10  JobSat                          10105 non-null  int64  
 11  TimeSearching                   10105 non-null  int64  
 12  TimeAnswering                   

In [12]:
class ModelingThunhap:
    """Fit a linear-regression model of ``CompTotal`` and summarise its predictions.

    Intended call order: ``tien_xu_ly()`` -> ``train_model()`` -> ``tong_hop_theo*()``.

    NOTE(review): the summaries label the predictions "USD", but nothing in this
    class converts ``CompTotal`` to a common currency -- confirm the unit of that
    column in the input data.
    """

    # CompTotal values (and, later, predictions) are kept only strictly between these bounds.
    NGUONG_THAP = 1000
    NGUONG_CAO = 5e6
    # A country needs at least this many remaining rows to stay in the modelling frame.
    SO_DONG_TOI_THIEU = 30

    def __init__(self, df):
        """Store a private copy of ``df`` so the caller's frame is never modified."""
        self.df = df.copy()
        # Cleaned frame; gains a 'LuongDuDoan' column after train_model().
        self.df_model = None
        # Fitted sklearn Pipeline (preprocessing + LinearRegression).
        self.pipeline = None
        # R^2 of the fitted pipeline on the 20% hold-out split; None until train_model() runs.
        self.r2_test = None

    def chuyen_doi_workexp(self, val):
        """Convert one ``WorkExp`` entry to a number of years.

        Returns 51 for 'More than 50 years', 0.5 for 'Less than 1 year',
        ``float(val)`` for anything float() accepts, and ``None`` when the value
        cannot be converted.
        """
        if val == 'More than 50 years':
            return 51
        elif val == 'Less than 1 year':
            return 0.5
        try:
            return float(val)
        except (TypeError, ValueError):
            # Only conversion failures mean "unknown"; anything else should surface.
            return None

    def tien_xu_ly(self):
        """Build ``self.df_model``: the cleaned subset of ``self.df`` used for modelling.

        Steps: keep the modelling columns, convert ``WorkExp`` to numbers, drop rows
        missing ``WorkExp`` or ``CompTotal``, keep ``CompTotal`` strictly between
        ``NGUONG_THAP`` and ``NGUONG_CAO``, then keep countries that still have at
        least ``SO_DONG_TOI_THIEU`` rows.

        Raises:
            KeyError: if ``self.df`` lacks one of the required columns.
        """
        cot_can_dung = ['DevType', 'Country', 'EdLevel', 'WorkExp',
                        'CompTotal', 'RemoteWork', 'OrgSize']
        # Explicit copy: the column assignment below must not target a view of self.df.
        df = self.df[cot_can_dung].copy()
        df['WorkExp'] = df['WorkExp'].apply(self.chuyen_doi_workexp)
        df = df.dropna(subset=['WorkExp', 'CompTotal'])
        df = df[(df['CompTotal'] > self.NGUONG_THAP) & (df['CompTotal'] < self.NGUONG_CAO)]

        # Country counts are taken after the compensation filter, as in the original flow.
        so_dong = df['Country'].value_counts()
        quoc_gia_hop_le = so_dong[so_dong >= self.SO_DONG_TOI_THIEU].index
        # Store an independent frame so train_model() can add a column without
        # triggering pandas' SettingWithCopyWarning.
        self.df_model = df[df['Country'].isin(quoc_gia_hop_le)].copy()

    def train_model(self):
        """Fit the pipeline and attach predictions to ``self.df_model``.

        The pipeline is fitted on an 80% split (``random_state=42``); its R^2 on the
        remaining 20% is stored in ``self.r2_test``. Predictions are then computed
        for every row of ``self.df_model`` (training rows included), stored as
        'LuongDuDoan', and rows whose prediction is not strictly between
        ``NGUONG_THAP`` and ``NGUONG_CAO`` are dropped.

        Because ``self.df_model`` is replaced by the filtered frame, calling this
        method again trains on the already-filtered rows; call ``tien_xu_ly()``
        first to retrain from the full cleaned data.

        Raises:
            RuntimeError: if ``tien_xu_ly()`` has not been called yet.
        """
        if self.df_model is None:
            raise RuntimeError("tien_xu_ly() must be called before train_model()")

        X = self.df_model.drop(columns='CompTotal')
        y = self.df_model['CompTotal']

        cot_so = ['WorkExp']
        # NOTE(review): 'OrgSize' is selected in tien_xu_ly() but is not listed here,
        # and ColumnTransformer drops unlisted columns by default, so it is not a
        # model feature -- confirm whether that is intended.
        cot_phan_loai = ['DevType', 'Country', 'EdLevel', 'RemoteWork']

        xu_ly = ColumnTransformer([
            ('so', SimpleImputer(strategy='mean'), cot_so),
            ('phan_loai', Pipeline([
                ('impute', SimpleImputer(strategy='most_frequent')),
                # Categories unseen during fit are encoded as all zeros instead of failing.
                ('encode', OneHotEncoder(handle_unknown='ignore'))
            ]), cot_phan_loai)
        ])

        self.pipeline = Pipeline([
            ('xu_ly', xu_ly),
            ('model', LinearRegression())
        ])

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        self.pipeline.fit(X_train, y_train)
        # Record hold-out quality so the split is actually used for evaluation.
        self.r2_test = self.pipeline.score(X_test, y_test)

        co_du_doan = self.df_model.assign(LuongDuDoan=self.pipeline.predict(X))
        hop_le = ((co_du_doan['LuongDuDoan'] > self.NGUONG_THAP) &
                  (co_du_doan['LuongDuDoan'] < self.NGUONG_CAO))
        self.df_model = co_du_doan[hop_le].copy()

    def tong_hop_theo(self, cot_nhom, ten_moi):
        """Return the mean prediction per value of ``cot_nhom`` as a display table.

        Args:
            cot_nhom: column of ``self.df_model`` to group by.
            ten_moi: header to show for the grouping column.

        Returns:
            DataFrame with columns ``ten_moi`` and 'Thu nhập dự đoán'; the latter
            holds strings such as '2,250.00 USD' (mean rounded to a whole number).
        """
        kq = self.df_model.groupby(cot_nhom)['LuongDuDoan'].mean().reset_index()
        kq['LuongDuDoan'] = kq['LuongDuDoan'].round(0).map(lambda x: f"{x:,.2f} USD")
        return kq.rename(columns={
            cot_nhom: ten_moi,
            'LuongDuDoan': 'Thu nhập dự đoán'
        })

    def tong_hop_theo_kinh_nghiem(self):
        """Return the mean prediction per ``WorkExp`` value (see ``tong_hop_theo``)."""
        return self.tong_hop_theo('WorkExp', 'Kinh nghiệm (năm)')


# Build the modelling helper, clean the data, then fit the regression.
model = ModelingThunhap(df)
model.tien_xu_ly()
model.train_model()

# One entry per categorical breakdown: (heading to print, grouping column, display label).
cac_bang_tong_hop = [
    ("\nThu nhập dự đoán theo ngành nghề:", 'DevType', 'Ngành nghề'),
    ("\nTheo hình thức làm việc:", 'RemoteWork', 'Hình thức làm việc'),
    ("\nTheo quốc gia:", 'Country', 'Quốc gia'),
    ("\nTheo trình độ học vấn:", 'EdLevel', 'Trình độ học vấn'),
    ("\nTheo quy mô công ty:", 'OrgSize', 'Quy mô công ty'),
]
for tieu_de, cot_nhom, nhan in cac_bang_tong_hop:
    print(tieu_de)
    print(model.tong_hop_theo(cot_nhom, nhan))

# Years of experience has its own dedicated summary method.
print("\nTheo số năm kinh nghiệm:")
print(model.tong_hop_theo_kinh_nghiem())



Thu nhập dự đoán theo ngành nghề:
             Ngành nghề Thu nhập dự đoán
0    Back-End Developer   424,140.00 USD
1    Embedded Developer   265,108.00 USD
2   Front-End Developer   454,102.00 USD
3  Full-Stack Developer   334,035.00 USD
4        Game Developer   344,094.00 USD
5               Manager   283,586.00 USD
6      Mobile Developer   499,872.00 USD
7                 Other   334,437.00 USD
8            Researcher   277,392.00 USD

Theo hình thức làm việc:
  Hình thức làm việc Thu nhập dự đoán
0             Hybrid   340,542.00 USD
1          In-person   592,050.00 USD
2             Remote   332,992.00 USD

Theo quốc gia:
                                             Quốc gia  Thu nhập dự đoán
0                                           Argentina    145,153.00 USD
1                                           Australia    180,517.00 USD
2                                             Austria     96,640.00 USD
3                                          Bangladesh  1,158,851.00 USD
4