# import important Library

In [355]:
import pandas as pd
import numpy as np
from scipy.stats import stats
from scipy.stats import spearmanr, pearsonr
from scipy.stats import kruskal
from scipy.stats import chi2_contingency
from scipy.stats import median_test
from scipy.stats import friedmanchisquare
from scipy.stats import mannwhitneyu


# Clean data 

## Clean and Prepare Attribute Salary

In [356]:
'''Import data'''
#Saraly
file_path = r'C:\Nonpara_Project\Software Engineer Salaries.csv'
data = pd.read_csv(file_path)
data.head()

Unnamed: 0,Company,Company Score,Job Title,Location,Date,Salary
0,ViewSoft,4.8,Software Engineer,"Manassas, VA",8d,$68K - $94K (Glassdoor est.)
1,Workiva,4.3,Software Support Engineer,Remote,2d,$61K - $104K (Employer est.)
2,"Garmin International, Inc.",3.9,C# Software Engineer,"Cary, NC",2d,$95K - $118K (Glassdoor est.)
3,Snapchat,3.5,"Software Engineer, Fullstack, 1+ Years of Expe...","Los Angeles, CA",2d,$97K - $145K (Employer est.)
4,Vitesco Technologies Group AG,3.1,Software Engineer,"Seguin, TX",2d,$85K - $108K (Glassdoor est.)


### สร้าง Function 

In [357]:

# Create new columns 'Country' and 'City' by splitting the 'Location' column
data[['City', 'state']] = data['Location'].str.split(',', expand=True)

def extract_salary_range(salary_str):
    """Extracts the lower and upper bounds of the salary range from a string.
    Args:
        salary_str: The salary string (e.g., '$68K - $94K (Glassdoor est.)').
    Returns:
        A tuple containing the lower and upper salary bounds (as integers), or None if
        the string cannot be parsed.
    """
    # ตรวจสอบว่าค่าที่รับเข้ามาเป็นสตริงหรือไม่
    if isinstance(salary_str, str):
        # ลบอักขระที่ไม่จำเป็น
        salary_str = salary_str.replace('(', '').replace(')', '').replace('$', '').replace('K', '')
        
        # แยกสตริงตามเครื่องหมายขีด
        parts = salary_str.split('-')
        
        if len(parts) == 2:  # ถ้ามีช่วงเงินเดือน
            try:
                lower = int(parts[0].strip())
                upper = int(parts[1].strip().split()[0])  # จัดการกับข้อความส่วนเกินหลังขีดบน
                return lower, upper
            except ValueError:
                return None
        
        elif len(parts) == 1:  # ถ้ามีแค่ค่าเดียว
            try:
                value = int(parts[0].strip())
                return value, value  # กำหนดให้ค่าขีดบนและขีดล่างเท่ากัน
            except ValueError:
                return None
            
    return None  # ถ้าค่าไม่สามารถถูกแปลงได้


### เรียกใช้ฟังก์ชั่น

In [358]:
#ใช้งานฟังก์ชันกับคอลัมน์ Salary
data[['LowerSalary', 'UpperSalary']] = data['Salary'].apply(lambda x: pd.Series(extract_salary_range(x)))

### สร้างคอลัมน์ แสดงตัวแทนของเงินเดือน

In [359]:
#สร้าง columns Mid_Saraly เพื่อใช้เป็นตัวแทนของข้อมูลเงินเดือนที่กรอกค่ามาให้เป็นช่วง
data['Mid_Salary'] = [(lower + upper) / 2 for lower, upper in zip(data['LowerSalary'], data['UpperSalary'])]

In [360]:
data.sort_values(by='Mid_Salary',ascending=False).head()

Unnamed: 0,Company,Company Score,Job Title,Location,Date,Salary,City,state,LowerSalary,UpperSalary,Mid_Salary
526,BOEING,4.0,Senior Software Engineer,"Washington, DC",30d+,$343K - $637K (Employer est.),Washington,DC,343.0,637.0,490.0
578,INTEL,5.0,"Software Engineer 2 - Mainframe (Cobol, JCL, V...","Brookfield, WI",12d,$310K - $465K (Employer est.),Brookfield,WI,310.0,465.0,387.5
668,Workday,4.0,Software Engineer - Camera (Technical Leadership),"Redmond, WA",3d,$300K - $450K (Employer est.),Redmond,WA,300.0,450.0,375.0
157,Intuitive Machines LLC,3.8,Research Software Engineer,"Houston, TX",10d,$295K - $440K (Employer est.),Houston,TX,295.0,440.0,367.5
743,Teradyne,3.6,Software Engineer (Onsite),"Columbia, MD",25d,$248K - $385K (Employer est.),Columbia,MD,248.0,385.0,316.5


In [361]:
data.sort_values(by='Mid_Salary',ascending=True).head()

Unnamed: 0,Company,Company Score,Job Title,Location,Date,Salary,City,state,LowerSalary,UpperSalary,Mid_Salary
69,KAIROS Inc,3.5,Software Engineer,"Dahlgren, VA",16d,$6K - $7K (Employer est.),Dahlgren,VA,6.0,7.0,6.5
632,Geneva Technologies Inc.,3.4,Associate Design Engineer - Software V&V,"Wauwatosa, WI",30d+,$8K - $12K (Employer est.),Wauwatosa,WI,8.0,12.0,10.0
350,Solvere Technical Group,3.6,Software Engineer,"Fall River, MA",30d+,$10K - $20K (Employer est.),Fall River,MA,10.0,20.0,15.0
375,UiPath,3.7,Expert Software Engineer,"Alpharetta, GA",1d,$50K - $60K (Employer est.),Alpharetta,GA,50.0,60.0,55.0
642,NMG Technology Services,3.8,Senior Software Engineer,Oregon,5d,$37K - $82K (Employer est.),Oregon,,37.0,82.0,59.5


### สร้าง columns Remote 

In [362]:
#สร้างคอลัมน์ใหม่ 'Remote' ที่ตรวจสอบว่ามี 'Remote' ใน 'Location' หรือไม่ 
data['Remote'] = data['Location'].str.contains('Remote').fillna(False)
data

  data['Remote'] = data['Location'].str.contains('Remote').fillna(False)


Unnamed: 0,Company,Company Score,Job Title,Location,Date,Salary,City,state,LowerSalary,UpperSalary,Mid_Salary,Remote
0,ViewSoft,4.8,Software Engineer,"Manassas, VA",8d,$68K - $94K (Glassdoor est.),Manassas,VA,68.0,94.0,81.0,False
1,Workiva,4.3,Software Support Engineer,Remote,2d,$61K - $104K (Employer est.),Remote,,61.0,104.0,82.5,True
2,"Garmin International, Inc.",3.9,C# Software Engineer,"Cary, NC",2d,$95K - $118K (Glassdoor est.),Cary,NC,95.0,118.0,106.5,False
3,Snapchat,3.5,"Software Engineer, Fullstack, 1+ Years of Expe...","Los Angeles, CA",2d,$97K - $145K (Employer est.),Los Angeles,CA,97.0,145.0,121.0,False
4,Vitesco Technologies Group AG,3.1,Software Engineer,"Seguin, TX",2d,$85K - $108K (Glassdoor est.),Seguin,TX,85.0,108.0,96.5,False
...,...,...,...,...,...,...,...,...,...,...,...,...
865,RXO,,"Software Engineer, Machine Learning Compute","San Francisco, CA",6d,,San Francisco,CA,,,,False
866,Infosys,,Software Engineer - 3 (Apache NiFi),"Annapolis Junction, MD",18d,,Annapolis Junction,MD,,,,False
867,Medtronic,,Senior Software Engineer,"Southfield, MI",19d,,Southfield,MI,,,,False
868,,,Junior Python Developer,"Charlotte, NC",2d,,Charlotte,NC,,,,False


In [363]:
data['Remote'] = data['Remote'].replace({True: 'Yes', False: 'No'})


### แบ่งกลุ่มเงินเดือน

## Clean and Prepare Attribute "Job Title"

In [364]:
#ทำการ lower txt ใน Location เพื่อทำสามารถดึงข้อมูลได้อย่างครบถ้วน 
data['Job Title'] = data['Job Title'].str.lower()

In [365]:
# สร้างคอลัมน์ 'Group_job' โดยกำหนดค่าเริ่มต้นเป็น 'Other'
data['Group_job'] = 'Other'

In [366]:
# กำหนดเงื่อนไขสำหรับการจัดกลุ่ม
data.loc[data['Job Title'].str.contains('dev',case=False), 'Group_job'] = 'Software_Dev'


In [367]:
data.loc[data['Job Title'].str.contains('software', case=False), 'Group_job'] = 'Software_En'


In [368]:
data.loc[data['Job Title'].str.contains('cloud', case=False), 'Group_job'] = 'Software_clound'

In [369]:
# นับจำนวนค่าที่ไม่ซ้ำในคอลัมน์ 'Group_job'
unique_counts = data['Group_job'].value_counts()
unique_counts

Group_job
Software_En        777
Other               50
Software_Dev        29
Software_clound     14
Name: count, dtype: int64

### สร้างตัวแปร group_mean แทนค่าว่างของเงินเดือนด้วยค่าเฉลี่ยของเงินเดือนแต่ละกรุ๊ป

In [370]:
# คำนวณค่าเฉลี่ยของ Mid_Salary ตาม Group_job
group_mean = data.groupby('Group_job')['Mid_Salary'].transform('mean')

In [371]:
# แทนค่าว่างใน Mid_Salary ด้วยค่าเฉลี่ยของกลุ่ม
data['Mid_Salary'].fillna(group_mean, inplace=True)
data

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Mid_Salary'].fillna(group_mean, inplace=True)


Unnamed: 0,Company,Company Score,Job Title,Location,Date,Salary,City,state,LowerSalary,UpperSalary,Mid_Salary,Remote,Group_job
0,ViewSoft,4.8,software engineer,"Manassas, VA",8d,$68K - $94K (Glassdoor est.),Manassas,VA,68.0,94.0,81.000000,No,Software_En
1,Workiva,4.3,software support engineer,Remote,2d,$61K - $104K (Employer est.),Remote,,61.0,104.0,82.500000,Yes,Software_En
2,"Garmin International, Inc.",3.9,c# software engineer,"Cary, NC",2d,$95K - $118K (Glassdoor est.),Cary,NC,95.0,118.0,106.500000,No,Software_En
3,Snapchat,3.5,"software engineer, fullstack, 1+ years of expe...","Los Angeles, CA",2d,$97K - $145K (Employer est.),Los Angeles,CA,97.0,145.0,121.000000,No,Software_En
4,Vitesco Technologies Group AG,3.1,software engineer,"Seguin, TX",2d,$85K - $108K (Glassdoor est.),Seguin,TX,85.0,108.0,96.500000,No,Software_En
...,...,...,...,...,...,...,...,...,...,...,...,...,...
865,RXO,,"software engineer, machine learning compute","San Francisco, CA",6d,,San Francisco,CA,,,129.336193,No,Software_En
866,Infosys,,software engineer - 3 (apache nifi),"Annapolis Junction, MD",18d,,Annapolis Junction,MD,,,129.336193,No,Software_En
867,Medtronic,,senior software engineer,"Southfield, MI",19d,,Southfield,MI,,,129.336193,No,Software_En
868,,,junior python developer,"Charlotte, NC",2d,,Charlotte,NC,,,137.296296,No,Software_Dev


In [372]:
#แบ่งกลุ่มระดับเงินเป็น 5 ช่วง เท่าๆกัน 
# สร้างช่วงเงินเดือนที่กำหนดเอง
bins = [-float('inf'), 50, 100, 150, 200,250,300,350,float('inf')]  # ขีดจำกัดของแต่ละช่วง
labels = ['<50', '50-100', '101-150', '151-200','201-250','251-300','301-350','>350']  # ป้ายกำกับแต่ละช่วง
# แบ่งข้อมูลเงินเดือน (Mid_Salary) ออกเป็นช่วง
data['Salary_Range'] = pd.cut(data['Mid_Salary'], bins=bins, labels=labels, right=True)
data['Salary_Range'].value_counts()
data

Unnamed: 0,Company,Company Score,Job Title,Location,Date,Salary,City,state,LowerSalary,UpperSalary,Mid_Salary,Remote,Group_job,Salary_Range
0,ViewSoft,4.8,software engineer,"Manassas, VA",8d,$68K - $94K (Glassdoor est.),Manassas,VA,68.0,94.0,81.000000,No,Software_En,50-100
1,Workiva,4.3,software support engineer,Remote,2d,$61K - $104K (Employer est.),Remote,,61.0,104.0,82.500000,Yes,Software_En,50-100
2,"Garmin International, Inc.",3.9,c# software engineer,"Cary, NC",2d,$95K - $118K (Glassdoor est.),Cary,NC,95.0,118.0,106.500000,No,Software_En,101-150
3,Snapchat,3.5,"software engineer, fullstack, 1+ years of expe...","Los Angeles, CA",2d,$97K - $145K (Employer est.),Los Angeles,CA,97.0,145.0,121.000000,No,Software_En,101-150
4,Vitesco Technologies Group AG,3.1,software engineer,"Seguin, TX",2d,$85K - $108K (Glassdoor est.),Seguin,TX,85.0,108.0,96.500000,No,Software_En,50-100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
865,RXO,,"software engineer, machine learning compute","San Francisco, CA",6d,,San Francisco,CA,,,129.336193,No,Software_En,101-150
866,Infosys,,software engineer - 3 (apache nifi),"Annapolis Junction, MD",18d,,Annapolis Junction,MD,,,129.336193,No,Software_En,101-150
867,Medtronic,,senior software engineer,"Southfield, MI",19d,,Southfield,MI,,,129.336193,No,Software_En,101-150
868,,,junior python developer,"Charlotte, NC",2d,,Charlotte,NC,,,137.296296,No,Software_Dev,101-150


## Clean and Prepare Attribute "Company"

In [373]:
data.columns
data['Company'].fillna('Unknow',inplace=True)
data.head()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Company'].fillna('Unknow',inplace=True)


Unnamed: 0,Company,Company Score,Job Title,Location,Date,Salary,City,state,LowerSalary,UpperSalary,Mid_Salary,Remote,Group_job,Salary_Range
0,ViewSoft,4.8,software engineer,"Manassas, VA",8d,$68K - $94K (Glassdoor est.),Manassas,VA,68.0,94.0,81.0,No,Software_En,50-100
1,Workiva,4.3,software support engineer,Remote,2d,$61K - $104K (Employer est.),Remote,,61.0,104.0,82.5,Yes,Software_En,50-100
2,"Garmin International, Inc.",3.9,c# software engineer,"Cary, NC",2d,$95K - $118K (Glassdoor est.),Cary,NC,95.0,118.0,106.5,No,Software_En,101-150
3,Snapchat,3.5,"software engineer, fullstack, 1+ years of expe...","Los Angeles, CA",2d,$97K - $145K (Employer est.),Los Angeles,CA,97.0,145.0,121.0,No,Software_En,101-150
4,Vitesco Technologies Group AG,3.1,software engineer,"Seguin, TX",2d,$85K - $108K (Glassdoor est.),Seguin,TX,85.0,108.0,96.5,No,Software_En,50-100


## Clean and Prepare Attribute "Company Score"

In [374]:
data['Company Score'].fillna(data['Company Score'].mean(),inplace=True)
data.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Company Score'].fillna(data['Company Score'].mean(),inplace=True)


Unnamed: 0,Company,Company Score,Job Title,Location,Date,Salary,City,state,LowerSalary,UpperSalary,Mid_Salary,Remote,Group_job,Salary_Range
0,ViewSoft,4.8,software engineer,"Manassas, VA",8d,$68K - $94K (Glassdoor est.),Manassas,VA,68.0,94.0,81.0,No,Software_En,50-100
1,Workiva,4.3,software support engineer,Remote,2d,$61K - $104K (Employer est.),Remote,,61.0,104.0,82.5,Yes,Software_En,50-100
2,"Garmin International, Inc.",3.9,c# software engineer,"Cary, NC",2d,$95K - $118K (Glassdoor est.),Cary,NC,95.0,118.0,106.5,No,Software_En,101-150
3,Snapchat,3.5,"software engineer, fullstack, 1+ years of expe...","Los Angeles, CA",2d,$97K - $145K (Employer est.),Los Angeles,CA,97.0,145.0,121.0,No,Software_En,101-150
4,Vitesco Technologies Group AG,3.1,software engineer,"Seguin, TX",2d,$85K - $108K (Glassdoor est.),Seguin,TX,85.0,108.0,96.5,No,Software_En,50-100


In [375]:
# แบ่งกลุ่มคะแนนบริษัทเป็น 3 ช่วง
# กำหนดช่วงคะแนน (เพิ่มช่วงบนเพื่อครอบคลุมคะแนนสูง)
bins_score = [0, 2, 4, float('inf')]  # ช่วงคะแนนต่ำ, กลาง, สูง
labels_score = ['ต่ำ', 'กลาง', 'สูง']

In [376]:
# ใช้ pd.cut() เพื่อแบ่งกลุ่มคะแนน
data['Company_Score_Group'] = pd.cut(data['Company Score'], bins=bins_score, labels=labels_score, right=False)

In [377]:
# แสดงจำนวนคะแนนในแต่ละกลุ่ม
print(data['Company_Score_Group'].value_counts())

Company_Score_Group
กลาง    510
สูง     359
ต่ำ       1
Name: count, dtype: int64


# Prepare data for tests And Test hypotensis 

## วัตถุประสงค์การศึกษาที่ 1 เพื่อศึกษาเกี่ยวกับเงินเดือนในสายงาน Software

In [378]:
# แยกข้อมูลเงินเดือนตามกลุ่มงาน
Sa_Software_En = data[data['Group_job'] == 'Software_En']['Mid_Salary']
Sa_Software_Dev = data[data['Group_job'] == 'Software_Dev']['Mid_Salary']
Sa_Software_clound = data[data['Group_job'] == 'Software_clound']['Mid_Salary']
Sa_Other = data[data['Group_job'] == 'Other']['Mid_Salary']

In [379]:
Sa_Software_En.head()

0     81.0
1     82.5
2    106.5
3    121.0
4     96.5
Name: Mid_Salary, dtype: float64

In [380]:
Sa_Software_Dev.head()

97     107.5
105     82.5
107    106.0
127    154.0
141    114.0
Name: Mid_Salary, dtype: float64

In [381]:
Sa_Software_clound.head()

91     106.5
114    187.5
171    108.5
172    112.5
176    168.0
Name: Mid_Salary, dtype: float64

In [382]:
Sa_Other.head()

5     149.0
15    112.5
25    133.0
63    160.0
85    126.0
Name: Mid_Salary, dtype: float64

### For question 1 ทดสอบว่าเงินเดือนของสายงาน Software ทั้ง 4 กลุ่มแตกต่างกันหรือไม่ 

In [383]:
# ทำการทดสอบ Kruskal-Wallis
stat, p_value = kruskal(Sa_Software_En, Sa_Software_Dev, Sa_Software_clound, Sa_Other)
# แสดงผลลัพธ์
print(f"Kruskal-Wallis H-statistic: {stat}")
print(f"P-value: {p_value}")


Kruskal-Wallis H-statistic: 1.351277600103452
P-value: 0.7169941931246939


### For Question 2 ทดสอบสมมติฐาน "ทดสอบความสัมพันธ์ของตัวแปรคะแนนของบริษัทและเงินเดือน"

In [384]:


from scipy.stats import spearmanr, pearsonr
# คำนวณ Spearman correlation
spearman_corr, spearman_p_value = spearmanr(data['Mid_Salary'], data['Company Score'])
# แสดงผลลัพธ์
print(f"Spearman Correlation Coefficient: {spearman_corr}")
print(f"P-value: {spearman_p_value}")


Spearman Correlation Coefficient: 0.025205222131919135
P-value: 0.45778667694508435


In [385]:
# คำนวณ Pearson correlation
pearson_corr, pearson_p_value = pearsonr(data['Mid_Salary'], data['Company Score'])
print(f"Pearson Correlation Coefficient: {pearson_corr}")
print(f"Pearson P-value: {pearson_p_value}")

Pearson Correlation Coefficient: 0.052383474104882935
Pearson P-value: 0.12260549092045343


### For Question 3 ระดับคะแนนของบริษัทมีผลต่อระดับเงินเดือนหรือไม่ 

In [386]:
# สร้างตารางไขว้ (contingency table) ระหว่าง Salary_Range และ Company_Score_Group
contingency_table_sa_co = pd.crosstab(data['Salary_Range'], data['Company_Score_Group'])
contingency_table_sa_co

Company_Score_Group,ต่ำ,กลาง,สูง
Salary_Range,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
<50,0,3,0
50-100,0,108,87
101-150,1,310,184
151-200,0,62,64
201-250,0,21,16
251-300,0,4,5
301-350,0,1,0
>350,0,1,3


In [387]:

# ทำการทดสอบ Chi-Square
chi2, p_value, dof, expected = chi2_contingency(contingency_table_sa_co)

# แสดงผลลัพธ์
print("Expected frequencies:")
print(expected)
print(f"Degrees of freedom: {dof}")
print(f"Chi-Square statistic: {chi2}")
print(f"P-value: {p_value}")

Expected frequencies:
[[3.44827586e-03 1.75862069e+00 1.23793103e+00]
 [2.24137931e-01 1.14310345e+02 8.04655172e+01]
 [5.68965517e-01 2.90172414e+02 2.04258621e+02]
 [1.44827586e-01 7.38620690e+01 5.19931034e+01]
 [4.25287356e-02 2.16896552e+01 1.52678161e+01]
 [1.03448276e-02 5.27586207e+00 3.71379310e+00]
 [1.14942529e-03 5.86206897e-01 4.12643678e-01]
 [4.59770115e-03 2.34482759e+00 1.65057471e+00]]
Degrees of freedom: 14
Chi-Square statistic: 15.182977057391282
P-value: 0.36576210838124673


## วัตถุประสงค์ที่ 2 เพื่อศึกษาเกี่ยวกับการทำงานแบบ Remote 


In [388]:
data['Group_job'].value_counts()

Group_job
Software_En        777
Other               50
Software_Dev        29
Software_clound     14
Name: count, dtype: int64

In [389]:
data['Remote'].value_counts()

Remote
No     831
Yes     39
Name: count, dtype: int64

In [390]:
Software_En=data[data['Group_job']=='Software_En']


In [391]:
Software_Dev=data[data['Group_job']=='Software_Dev']


In [392]:
Software_clound=data[data['Group_job']=='Software_clound']


### For question 1 เงินเดือนของของแต่ละ Group_Job ในรูปแบบการทำงานแบบ Remote และ แบบ onsite  มีความแตกต่างกันหรือไม่ ?


In [394]:
Software_En.info()

<class 'pandas.core.frame.DataFrame'>
Index: 777 entries, 0 to 869
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   Company              777 non-null    object  
 1   Company Score        777 non-null    float64 
 2   Job Title            777 non-null    object  
 3   Location             767 non-null    object  
 4   Date                 777 non-null    object  
 5   Salary               680 non-null    object  
 6   City                 767 non-null    object  
 7   state                662 non-null    object  
 8   LowerSalary          641 non-null    float64 
 9   UpperSalary          641 non-null    float64 
 10  Mid_Salary           777 non-null    float64 
 11  Remote               777 non-null    object  
 12  Group_job            777 non-null    object  
 13  Salary_Range         777 non-null    category
 14  Company_Score_Group  777 non-null    category
dtypes: category(2), float64(4), 

In [395]:
# Separate Remote (Yes) and Onsite (No) salaries
en_remote_salaries = Software_En[Software_En['Remote'] == 'Yes']['Mid_Salary']
en_onsite_salaries = Software_En[Software_En['Remote'] == 'No']['Mid_Salary']

In [396]:
Software_En[Software_En['Remote']=='Yes']['Mid_Salary'].mean()

121.67865404271345

In [397]:
Software_En[Software_En['Remote']=='No']['Mid_Salary'].mean()

129.63307665996481

In [398]:
from scipy.stats import median_test

# Perform Median Test
if not en_remote_salaries.empty and not en_onsite_salaries.empty:
    stat_en, p_value_en, _, _ = median_test(en_remote_salaries, en_onsite_salaries)
    print(f"Median Test statistic: {stat_en}")
    print(f"P-value (Median Test): {p_value_en}")



Median Test statistic: 2.622028561674244
P-value (Median Test): 0.10538963219160508


In [399]:
Software_Dev[Software_Dev['Remote']=='Yes']['Mid_Salary'].mean()

121.54938271604938

In [400]:
Software_Dev[Software_Dev['Remote']=='No']['Mid_Salary'].mean()

141.40418679549114

In [401]:
# Separate Remote (Yes) and Onsite (No) salaries
Dev_remote_salaries = Software_Dev[Software_Dev['Remote'] == 'Yes']['Mid_Salary']
Dev_onsite_salaries = Software_Dev[Software_Dev['Remote'] == 'No']['Mid_Salary']


In [402]:
Dev_onsite_salaries

97     107.500000
105     82.500000
127    154.000000
149     80.000000
153    103.500000
191    153.500000
206     94.000000
235    103.000000
248    125.000000
328    113.000000
363    112.000000
372    285.000000
410    215.000000
414     98.500000
427    272.500000
439    153.500000
440    153.000000
444     97.500000
447    211.500000
458    195.500000
460    108.500000
572     96.500000
868    137.296296
Name: Mid_Salary, dtype: float64

In [403]:
Software_Dev[Software_Dev['Remote'] == 'Yes']['Mid_Salary'].median()

120.75

In [404]:
Software_Dev[Software_Dev['Remote'] == 'No']['Mid_Salary'].median()

113.0

In [405]:
from scipy.stats import median_test

# Perform Median Test
if not Dev_remote_salaries.empty and not Dev_onsite_salaries.empty:
    stat_dev, p_value_dev, _, _ = median_test(Dev_remote_salaries, Dev_onsite_salaries)
    print(f"Median Test statistic: {stat_dev}")
    print(f"P-value (Median Test): {p_value_dev}")


Median Test statistic: 0.0
P-value (Median Test): 1.0


In [406]:
#Software Clound
cl_remote_salaries=Software_clound[Software_clound['Remote']=='Yes']['Mid_Salary']
cl_onsite_salaries=Software_Dev[Software_Dev['Remote'] == 'No']['Mid_Salary']

In [407]:
Software_clound[Software_clound['Remote']=='Yes']['Mid_Salary'].median()

103.5

In [408]:
Software_Dev[Software_Dev['Remote'] == 'No']['Mid_Salary'].median()

113.0

In [409]:
cl_onsite_salaries

97     107.500000
105     82.500000
127    154.000000
149     80.000000
153    103.500000
191    153.500000
206     94.000000
235    103.000000
248    125.000000
328    113.000000
363    112.000000
372    285.000000
410    215.000000
414     98.500000
427    272.500000
439    153.500000
440    153.000000
444     97.500000
447    211.500000
458    195.500000
460    108.500000
572     96.500000
868    137.296296
Name: Mid_Salary, dtype: float64

In [410]:
from scipy.stats import median_test

# Perform Median Test
if not cl_remote_salaries.empty and not cl_onsite_salaries.empty:
    stat, p_value, _, _ = median_test(cl_remote_salaries, cl_onsite_salaries)
    print(f"Median Test statistic: {stat}")
    print(f"P-value (Median Test): {p_value}")


Median Test statistic: 0.0
P-value (Median Test): 1.0


### Questions 3 คะแนนบริษัทของการทำงานแบบ remote สูงกว่าการทำงานแบบ onsite หรือไม่ 

In [411]:
company_score_remote=data[data['Remote']=='Yes']['Company Score']

In [412]:
company_score_remote

1      4.300000
9      4.200000
11     3.800000
29     3.500000
60     3.800000
107    4.700000
141    3.500000
143    3.700000
152    4.400000
161    4.600000
167    4.200000
178    4.000000
181    3.800000
194    3.500000
199    4.000000
204    3.100000
229    4.400000
249    3.900000
296    3.900000
342    3.900000
354    4.500000
379    4.600000
424    3.800000
450    4.000000
465    3.900000
486    3.200000
505    4.100000
560    3.700000
563    3.900000
565    3.900000
604    3.300000
607    3.600000
635    3.700000
646    4.000000
676    3.900000
744    4.600000
776    3.700000
795    3.895311
801    3.895311
Name: Company Score, dtype: float64

In [413]:
company_score_onsite=data[data['Remote']=='No']['Company Score']

In [414]:
company_score_onsite

0      4.800000
2      3.900000
3      3.500000
4      3.100000
5      3.900000
         ...   
865    3.895311
866    3.895311
867    3.895311
868    3.895311
869    3.895311
Name: Company Score, Length: 831, dtype: float64

In [415]:
from scipy import stats

# ทดสอบสมมติฐานทางเดียว (Remote > Onsite)
if not company_score_remote.empty and not company_score_onsite.empty:
    mann_greater_stat,mann_greater_p_values = stats.mannwhitneyu(company_score_remote, company_score_onsite, alternative='greater')
    print(f"Mann-Whitney U Statistic: {mann_greater_stat}")
    print(f"P-value (One-Tailed Test): {mann_greater_p_values}")

Mann-Whitney U Statistic: 16909.5
P-value (One-Tailed Test): 0.32263953533850287


In [416]:
from scipy import stats

# ทดสอบสมมติฐานทางเดียว (Remote > Onsite)
if not company_score_remote.empty and not company_score_onsite.empty:
    mann_left_stat,mann_left_p_values = stats.mannwhitneyu(company_score_remote, company_score_onsite, alternative='greater')
    print(f"Mann-Whitney U Statistic: {mann_left_stat}")
    print(f"P-value (One-Tailed Test): {mann_left_p_values}")

Mann-Whitney U Statistic: 16909.5
P-value (One-Tailed Test): 0.32263953533850287


In [417]:
data.info()
data['City'].fillna('Unknow', inplace=True)
data['state'].fillna('Unknow',inplace=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 870 entries, 0 to 869
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   Company              870 non-null    object  
 1   Company Score        870 non-null    float64 
 2   Job Title            870 non-null    object  
 3   Location             857 non-null    object  
 4   Date                 870 non-null    object  
 5   Salary               764 non-null    object  
 6   City                 857 non-null    object  
 7   state                731 non-null    object  
 8   LowerSalary          722 non-null    float64 
 9   UpperSalary          722 non-null    float64 
 10  Mid_Salary           870 non-null    float64 
 11  Remote               870 non-null    object  
 12  Group_job            870 non-null    object  
 13  Salary_Range         870 non-null    category
 14  Company_Score_Group  870 non-null    category
dtypes: category(2), float64

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['City'].fillna('Unknow', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['state'].fillna('Unknow',inplace=True)


In [418]:
data.columns

Index(['Company', 'Company Score', 'Job Title', 'Location', 'Date', 'Salary',
       'City', 'state', 'LowerSalary', 'UpperSalary', 'Mid_Salary', 'Remote',
       'Group_job', 'Salary_Range', 'Company_Score_Group'],
      dtype='object')

In [419]:
import numpy as np
from scipy.stats import chisquare

# สร้างข้อมูลนับจำนวนการเกิดขึ้นจริงของแต่ละ state (จากคอลัมน์ 'state')
observed_counts = data['state'].value_counts().values

# คาดว่าการกระจายตัวควรจะเท่าๆ กัน แต่ต้องแน่ใจว่าผลรวมเท่ากัน
expected_counts = np.full_like(observed_counts, np.mean(observed_counts))

# ปรับ expected_counts ให้มีผลรวมเท่ากับ observed_counts
expected_counts = expected_counts * (observed_counts.sum() / expected_counts.sum())

# เรียกใช้ Chi-Square Test
chi_stat, chi_p_value = chisquare(observed_counts, f_exp=expected_counts)

print(f"Chi-Square Statistic: {chi_stat}")
print(f"P-Value: {chi_p_value}")


Chi-Square Statistic: 2046.9287356321843
P-Value: 0.0


In [420]:
observed_counts

array([139, 125,  75,  51,  49,  44,  39,  31,  30,  26,  24,  23,  21,
        20,  19,  17,  13,  12,  12,  11,  10,   8,   7,   5,   5,   4,
         4,   4,   4,   4,   4,   3,   3,   3,   3,   2,   2,   2,   2,
         2,   2,   2,   1,   1,   1,   1], dtype=int64)

In [421]:
expected_counts

array([18.91304348, 18.91304348, 18.91304348, 18.91304348, 18.91304348,
       18.91304348, 18.91304348, 18.91304348, 18.91304348, 18.91304348,
       18.91304348, 18.91304348, 18.91304348, 18.91304348, 18.91304348,
       18.91304348, 18.91304348, 18.91304348, 18.91304348, 18.91304348,
       18.91304348, 18.91304348, 18.91304348, 18.91304348, 18.91304348,
       18.91304348, 18.91304348, 18.91304348, 18.91304348, 18.91304348,
       18.91304348, 18.91304348, 18.91304348, 18.91304348, 18.91304348,
       18.91304348, 18.91304348, 18.91304348, 18.91304348, 18.91304348,
       18.91304348, 18.91304348, 18.91304348, 18.91304348, 18.91304348,
       18.91304348])

In [422]:
import numpy as np
from scipy.stats import chisquare

# สร้าง observed counts จากข้อมูล state (หรือข้อมูลหมวดหมู่อื่นๆ)
observed_counts = data['state'].value_counts().values

# คำนวณ expected counts แบบ uniform distribution
expected_counts = np.ones_like(observed_counts) * observed_counts.sum() / len(observed_counts)

# รวมเซลล์ที่มีค่า expected < 5
min_expected_threshold = 5
if np.any(expected_counts < min_expected_threshold):
    observed_counts = np.where(expected_counts < min_expected_threshold, 
                               observed_counts[expected_counts >= min_expected_threshold].sum(), 
                               observed_counts)
    expected_counts = np.where(expected_counts < min_expected_threshold, 
                               expected_counts[expected_counts >= min_expected_threshold].sum(), 
                               expected_counts)

# เรียกใช้ Chi-Square Test
chi_stat, chi_p_value = chisquare(observed_counts, f_exp=expected_counts)

print(f"Chi-Square Statistic: {chi_stat}")
print(f"P-Value: {chi_p_value}")


Chi-Square Statistic: 2046.9287356321843
P-Value: 0.0


In [423]:
expected_counts

array([18.91304348, 18.91304348, 18.91304348, 18.91304348, 18.91304348,
       18.91304348, 18.91304348, 18.91304348, 18.91304348, 18.91304348,
       18.91304348, 18.91304348, 18.91304348, 18.91304348, 18.91304348,
       18.91304348, 18.91304348, 18.91304348, 18.91304348, 18.91304348,
       18.91304348, 18.91304348, 18.91304348, 18.91304348, 18.91304348,
       18.91304348, 18.91304348, 18.91304348, 18.91304348, 18.91304348,
       18.91304348, 18.91304348, 18.91304348, 18.91304348, 18.91304348,
       18.91304348, 18.91304348, 18.91304348, 18.91304348, 18.91304348,
       18.91304348, 18.91304348, 18.91304348, 18.91304348, 18.91304348,
       18.91304348])

In [424]:
import numpy as np
from scipy.stats import kstest, norm

# สมมติฐานว่าเงินเดือนในแบบ Remote และ Onsite ถูกดึงมา
s_remote_salaries = data[data['Remote'] == 'Yes']['Mid_Salary']
s_onsite_salaries = data[data['Remote'] == 'No']['Mid_Salary']


In [425]:
# ทดสอบการกระจายตัวของ Remote Salaries กับการกระจายปกติ
stat_remote, p_value_remote = kstest(s_remote_salaries, 'norm', args=(np.mean(s_remote_salaries), np.std(s_remote_salaries)))

# ทดสอบการกระจายตัวของ Onsite Salaries กับการกระจายปกติ
stat_onsite, p_value_onsite = kstest(s_onsite_salaries, 'norm', args=(np.mean(s_onsite_salaries), np.std(s_onsite_salaries)))

# แสดงผลลัพธ์
print(f"Kolmogorov-Smirnov Test for Remote Salaries: Statistic={stat_remote}, P-Value={p_value_remote}")
print(f"Kolmogorov-Smirnov Test for Onsite Salaries: Statistic={stat_onsite}, P-Value={p_value_onsite}")


Kolmogorov-Smirnov Test for Remote Salaries: Statistic=0.18840608837616857, P-Value=0.10986072068159913
Kolmogorov-Smirnov Test for Onsite Salaries: Statistic=0.1777787337939234, P-Value=1.9040207747490034e-23


In [426]:
data.columns

Index(['Company', 'Company Score', 'Job Title', 'Location', 'Date', 'Salary',
       'City', 'state', 'LowerSalary', 'UpperSalary', 'Mid_Salary', 'Remote',
       'Group_job', 'Salary_Range', 'Company_Score_Group'],
      dtype='object')

In [427]:
data

Unnamed: 0,Company,Company Score,Job Title,Location,Date,Salary,City,state,LowerSalary,UpperSalary,Mid_Salary,Remote,Group_job,Salary_Range,Company_Score_Group
0,ViewSoft,4.800000,software engineer,"Manassas, VA",8d,$68K - $94K (Glassdoor est.),Manassas,VA,68.0,94.0,81.000000,No,Software_En,50-100,สูง
1,Workiva,4.300000,software support engineer,Remote,2d,$61K - $104K (Employer est.),Remote,Unknow,61.0,104.0,82.500000,Yes,Software_En,50-100,สูง
2,"Garmin International, Inc.",3.900000,c# software engineer,"Cary, NC",2d,$95K - $118K (Glassdoor est.),Cary,NC,95.0,118.0,106.500000,No,Software_En,101-150,กลาง
3,Snapchat,3.500000,"software engineer, fullstack, 1+ years of expe...","Los Angeles, CA",2d,$97K - $145K (Employer est.),Los Angeles,CA,97.0,145.0,121.000000,No,Software_En,101-150,กลาง
4,Vitesco Technologies Group AG,3.100000,software engineer,"Seguin, TX",2d,$85K - $108K (Glassdoor est.),Seguin,TX,85.0,108.0,96.500000,No,Software_En,50-100,กลาง
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
865,RXO,3.895311,"software engineer, machine learning compute","San Francisco, CA",6d,,San Francisco,CA,,,129.336193,No,Software_En,101-150,กลาง
866,Infosys,3.895311,software engineer - 3 (apache nifi),"Annapolis Junction, MD",18d,,Annapolis Junction,MD,,,129.336193,No,Software_En,101-150,กลาง
867,Medtronic,3.895311,senior software engineer,"Southfield, MI",19d,,Southfield,MI,,,129.336193,No,Software_En,101-150,กลาง
868,Unknow,3.895311,junior python developer,"Charlotte, NC",2d,,Charlotte,NC,,,137.296296,No,Software_Dev,101-150,กลาง


In [428]:
data['City'].value_counts()


City
United States         48
Remote                39
Annapolis Junction    35
San Francisco         28
Seattle               28
                      ..
San Antonio            1
Lorton                 1
Onalaska               1
Suwanee                1
Southfield             1
Name: count, Length: 324, dtype: int64

In [429]:
import numpy as np
import pandas as pd
from statsmodels.sandbox.stats.runs import runstest_1samp


In [430]:
z_stat, p_value = runstest_1samp(data['Mid_Salary'], correction=False)


In [431]:
print(f"Z-Statistic: {z_stat}")
print(f"P-Value: {p_value}")



Z-Statistic: -1.3809269151801908
P-Value: 0.16730143331499114


In [432]:

# นับจำนวนข้อมูลในแต่ละเมือง
city_counts = data['City'].value_counts()
city_counts

City
United States         48
Remote                39
Annapolis Junction    35
San Francisco         28
Seattle               28
                      ..
San Antonio            1
Lorton                 1
Onalaska               1
Suwanee                1
Southfield             1
Name: count, Length: 324, dtype: int64

In [433]:

# เตรียมข้อมูลสำหรับการทดสอบ
city_counts = city_counts.sort_index()  # เรียงตามชื่อเมือง
city_counts_values = city_counts.values  # ค่า
n = len(city_counts_values)  # จำนวนเมือง


In [434]:
# เรียกใช้ Run Test for Randomness
z_stat, p_value = runstest_1samp(city_counts_values, correction=False)

In [435]:
# แสดงผลลัพธ์
print(f"Z-Statistic: {z_stat}")
print(f"P-Value: {p_value}")

Z-Statistic: -1.2217036503204073
P-Value: 0.2218197153831355


In [438]:
# นับจำนวนข้อมูลในแต่ละเมือง
state_counts = data['state'].value_counts()
state_counts

state
Unknow    139
 CA       125
 MD        75
 WA        51
 TX        49
 MA        44
 VA        39
 FL        31
 NY        30
 GA        26
 IL        24
 MI        23
 CO        21
 NJ        20
 PA        19
 NC        17
 WI        13
 AZ        12
 OH        12
 MN        11
 DC        10
 CT         8
 UT         7
 OR         5
 IN         5
 KS         4
 ID         4
 AL         4
 OK         4
 IA         4
 MS         4
 VT         3
 DE         3
 NE         3
 SC         3
 RI         2
 PR         2
 AR         2
 WV         2
 NH         2
 KY         2
 NM         2
 HI         1
 TN         1
 MO         1
 NV         1
Name: count, dtype: int64

In [439]:
# เตรียมข้อมูลสำหรับการทดสอบ
state_counts = state_counts.sort_index()  # เรียงตามชื่อเมือง
state_counts_values = state_counts.values  # ค่า
n = len(state_counts_values)  # จำนวนเมือง


In [440]:
# เรียกใช้ Run Test for Randomness
z_stat, p_value = runstest_1samp(state_counts_values, correction=False)

In [441]:
# แสดงผลลัพธ์
print(f"Z-Statistic: {z_stat}")
print(f"P-Value: {p_value}")

Z-Statistic: 0.2663429164008639
P-Value: 0.7899751238410978


In [445]:
data.to_csv('C:/Nonpara_Project/data.csv', index=False)
