In [1]:
# import bibs
import pandas as pd
import os
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from matplotlib.patches import Patch
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import TruncatedSVD

**In your project, the following work steps will overlap and combine:**
- Feature selection
- Sampling
- Feature engineering

**1. Selection of features:**

You must select the right features (columns) that have the greatest impact on the main outcome of the project (**sales of tools for the metal industry**). There should be a causal relationship between the features and sales.

<u>Your task will be to select features from the following areas:</u>
- Employment
- Establishment
- Payroll
- Economic strength


**2. Filtering:**

Identify the most important occupations and industries that have a **realistically high tool consumption**

**3. Feature engineering / construction / aggregation:**

In all three datasets there are duplicates in the FIPS column as further dimensions are mapped in the naics or occ_code columns. This makes specific combinations of FIPS, NAICS, Occupation and a specific feature in a row possible. <br>

Due to the requirement to use a **cluster algorithm**, the final dataframe must have a special form. **Each region (FIPS number) may only occur once in the df.**
You must therefore prioritize the industries and occupations in this step and create special features that contain the Occupation or Industry dimension within them.

This is called **Pivoting or Wide-Format-Transformation**

<u>Possible features could be as listed below:</u>

- Number of employees in foundries (NAICS: 3315)
- Number of grinders (OCC: 51-9022)
- Number of grinders (OCC: 51-9022) in foundries (NAICS: 3315)

In [7]:
# Fetch the project repository; the raw pickle files loaded below are read from its data/raw folder.
!git clone https://github.com/motasem00/data-driven-modelling.git

Cloning into 'data-driven-modelling'...
remote: Enumerating objects: 47, done.[K
remote: Counting objects: 100% (47/47), done.[K
remote: Compressing objects: 100% (43/43), done.[K
remote: Total 47 (delta 11), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (47/47), 36.73 MiB | 6.41 MiB/s, done.
Resolving deltas: 100% (11/11), done.


# Loading data

In [8]:
# Keep the notebook's current working directory under both names and display it.
path = os.getcwd()
working_directory = path
path

'/content'

In [9]:
# Locate the raw-data folder of the cloned repository relative to the current
# working directory (where `git clone` placed it) instead of hard-coding the
# Colab-only "/content" prefix, so the cell also runs outside Colab.
# NOTE(review): pd.read_pickle can execute code embedded in the file; only load
# pickles from a source you trust.
raw_data_dir = os.path.join(os.getcwd(), "data-driven-modelling", "data", "raw")

# GDP table and industry pattern table
gdp_data = pd.read_pickle(os.path.join(raw_data_dir, "gdp.pickle"))
df_pattern = pd.read_pickle(os.path.join(raw_data_dir, "naics_pattern.pickle"))

# The occupation table is shipped in three parts
df_occupation_1 = pd.read_pickle(os.path.join(raw_data_dir, "naics_occupation_part1.pickle"))
df_occupation_2 = pd.read_pickle(os.path.join(raw_data_dir, "naics_occupation_part2.pickle"))
df_occupation_3 = pd.read_pickle(os.path.join(raw_data_dir, "naics_occupation_part3.pickle"))

In [10]:
# Locate the raw-data folder of the cloned repository relative to the current
# working directory instead of hard-coding the Colab-only "/content" prefix.
# NOTE(review): pd.read_pickle can execute code embedded in the file; only load
# pickles from a source you trust.
raw_data_dir = os.path.join(os.getcwd(), "data-driven-modelling", "data", "raw")

# County and state tables
county_data = pd.read_pickle(os.path.join(raw_data_dir, "county.pickle"))
State_data = pd.read_pickle(os.path.join(raw_data_dir, "state.pickle"))

In [11]:
# Store FIPS as text, then list the distinct string lengths that occur
# to see whether any codes need padding.
fips_text = df_pattern['FIPS'].astype(str)
df_pattern['FIPS'] = fips_text
unique_lengths = fips_text.map(len).unique()
unique_lengths

array([4, 5])

In [12]:
def add_zeros(code):
    """Left-pad a FIPS code string with zeros to five characters.

    The previous length-by-length branches covered lengths 1, 3 and 4 only,
    so a two-character code was returned unpadded; `str.zfill` handles every
    short length uniformly.

    Parameters
    ----------
    code : str
        FIPS code as text.

    Returns
    -------
    str
        The code padded to five characters. Codes that already have five or
        more characters, and the empty string, are returned unchanged.
    """
    if not code:
        # Preserve the old behaviour for empty input rather than inventing '00000'.
        return code
    return code.zfill(5)

In [13]:
# Run every FIPS string through add_zeros so short codes get their leading zeros.
df_pattern['FIPS'] = df_pattern['FIPS'].map(add_zeros)

In [14]:
# Stack the three occupation parts into one frame with a fresh 0..n-1 index.
occupation_parts = [df_occupation_1, df_occupation_2, df_occupation_3]
df_occupation = pd.concat(occupation_parts, ignore_index=True)
df_occupation.head(n=2)

Unnamed: 0,FIPS,State_GEOID,naics,NAICS_TITLE,emp_total_county_naics,OCC_CODE,OCC_TITLE,emp_occupation,state_name
0,12999,12,5613,Employment Services,1436559,49-9071,"Maintenance and Repair Workers, General",20639.514235,
1,6999,6,5613,Employment Services,729335,49-9071,"Maintenance and Repair Workers, General",9414.167765,


# My project plan is as follows:

* Load and link the datasets.
* Explore the data to understand it, find a way of matching the datasets, and arrive at a working model.
* Clean the data and drop the unnecessary columns.



We will start with the GDP Data

In [15]:
# List the GDP columns before deciding which ones to drop.
gdp_data.columns

Index(['FIPS', 'GeoName', 'Region', 'TableName', 'LineCode',
       'IndustryClassification', 'Description', 'Unit', '2017', '2018', '2019',
       '2020', '2021', '2022'],
      dtype='object')

In [16]:
# GDP columns that are not needed for the analysis
columns_to_drop_gdp = ['GeoName', 'Region', 'TableName', 'LineCode', 'Unit']

# Drop them; names missing from the frame are skipped instead of raising
filtered_gdp = gdp_data.drop(columns_to_drop_gdp, axis=1, errors='ignore')

# Print the columns that remain
print(filtered_gdp.columns)

Index(['FIPS', 'IndustryClassification', 'Description', '2017', '2018', '2019',
       '2020', '2021', '2022'],
      dtype='object')


Then I Will have a look at the pattern dataset

In [17]:
# List the industry-pattern columns before deciding which ones to drop.
df_pattern.columns

Index(['State_GEOID', 'County_GEOID', 'FIPS', 'naics_2', 'naics',
       'DESCRIPTION', 'emp_nf', 'emp', 'qp1_nf', 'qp1', 'ap_nf', 'ap', 'est',
       'n<5', 'n5_9', 'n10_19', 'n20_49', 'n50_99', 'n100_249', 'n250_499',
       'n500_999', 'n1000', 'n1000_1', 'n1000_2', 'n1000_3', 'n1000_4'],
      dtype='object')

In [18]:
# Pattern columns that are not needed: geography ids, the 2-digit NAICS code,
# the *_nf flag columns, first-quarter payroll and the establishment size classes
size_class_columns = [
    'n<5', 'n5_9', 'n10_19', 'n20_49', 'n50_99', 'n100_249', 'n250_499',
    'n500_999', 'n1000', 'n1000_1', 'n1000_2', 'n1000_3', 'n1000_4',
]
columns_to_drop_pattern = [
    'State_GEOID', 'County_GEOID', 'naics_2', 'emp_nf', 'qp1_nf', 'qp1', 'ap_nf',
] + size_class_columns

# Drop them; names missing from the frame are skipped instead of raising
filtered_pattern = df_pattern.drop(columns_to_drop_pattern, axis=1, errors='ignore')

# Print the columns that remain
print(filtered_pattern.columns)

Index(['FIPS', 'naics', 'DESCRIPTION', 'emp', 'ap', 'est'], dtype='object')


After looking in the County_Data

*   The FIPS column datatype should be changed to int

In [19]:
# Convert the FIPS key to integers (the zero padding added earlier is not kept by an int).
filtered_pattern = filtered_pattern.assign(FIPS=lambda frame: frame['FIPS'].astype(int))

In [20]:
# Check dtypes and non-null counts after the FIPS conversion.
filtered_pattern.info()

<class 'pandas.core.frame.DataFrame'>
Index: 54727 entries, 0 to 188589
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   FIPS         54727 non-null  int64 
 1   naics        54727 non-null  object
 2   DESCRIPTION  54727 non-null  object
 3   emp          54727 non-null  int64 
 4   ap           54727 non-null  int64 
 5   est          54727 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 2.9+ MB


After looking in the County_Data

*   The column named "GEOID" should be renamed to "FIPS"
*   The FIPS column datatype should be changed to int


In [21]:
# Give the county key the same name ('FIPS') that the other tables use.
county_data = county_data.rename(columns={'GEOID': 'FIPS'})

In [22]:
# Confirm that the key column is now called 'FIPS'.
county_data.columns

Index(['STATEFP', 'COUNTYFP', 'COUNTYNS', 'AFFGEOID', 'FIPS', 'NAME', 'LSAD',
       'ALAND', 'AWATER', 'geometry'],
      dtype='object')

In [23]:
# Cast the county key to int, the same conversion applied to filtered_pattern['FIPS'].
county_data = county_data.astype({"FIPS": int})

In [24]:
# Check dtypes and non-null counts after the FIPS conversion.
county_data.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 3233 entries, 0 to 3232
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   STATEFP   3233 non-null   object  
 1   COUNTYFP  3233 non-null   object  
 2   COUNTYNS  3233 non-null   object  
 3   AFFGEOID  3233 non-null   object  
 4   FIPS      3233 non-null   int64   
 5   NAME      3233 non-null   object  
 6   LSAD      3233 non-null   object  
 7   ALAND     3233 non-null   int64   
 8   AWATER    3233 non-null   int64   
 9   geometry  3233 non-null   geometry
dtypes: geometry(1), int64(3), object(6)
memory usage: 252.7+ KB


### Let's now have a look at the Occupation Dataset

In [25]:
# Occupation columns that are not needed for the analysis
columns_to_drop_occupation = ['State_GEOID', 'emp_total_county_naics', 'state_name']

# Drop them; names missing from the frame are skipped instead of raising
filtered_occupation = df_occupation.drop(columns_to_drop_occupation, axis=1, errors='ignore')

# Print the columns that remain
print(filtered_occupation.columns)

Index(['FIPS', 'naics', 'NAICS_TITLE', 'OCC_CODE', 'OCC_TITLE',
       'emp_occupation'],
      dtype='object')


In [26]:
# NOTE(review): the column listing printed by the previous cell already contains 'FIPS'
# and no 'GEOID', so this rename looks like a no-op — confirm and remove if so.
filtered_occupation.rename(columns={'GEOID': 'FIPS'}, inplace=True)

# Tool consumption

Here, the individual industries must be evaluated by **domain knowledge** with regard to their potential tool consumption.<br>
We will insert random values between 0 and 10 as an example

**We will do this twice:**

*   With the Industry Pattern Dataset
*   With the Occupation Dataset

<br>

---




**1. First, start with the industry pattern dataset**




In [27]:
# Generate random values for each NAICS and assign them to each entry in the group
random_values = {naics: np.random.randint(0, 11) for naics in filtered_pattern['naics'].unique()}
filtered_pattern['tool_consumption'] = filtered_pattern['naics'].map(random_values)
filtered_pattern.head(3)

Unnamed: 0,FIPS,naics,DESCRIPTION,emp,ap,est,tool_consumption
0,1001,1133,Logging,68,4563,7,2
1,1001,2123,Nonmetallic Mineral Mining and Quarrying,87,5144,6,0
2,1001,2211,"Electric Power Generation, Transmission and Di...",129,16342,4,4


In [28]:
# Generate random values for each NAICS and assign them to each entry in the group
random_values = {naics: np.random.randint(0, 11) for naics in filtered_occupation['naics'].unique()}
filtered_occupation['tool_consumption_occ'] = filtered_occupation['naics'].map(random_values)
filtered_occupation.head(3)

Unnamed: 0,FIPS,naics,NAICS_TITLE,OCC_CODE,OCC_TITLE,emp_occupation,tool_consumption_occ
0,12999,5613,Employment Services,49-9071,"Maintenance and Repair Workers, General",20639.514235,10
1,6999,5613,Employment Services,49-9071,"Maintenance and Repair Workers, General",9414.167765,10
2,36999,5613,Employment Services,49-9071,"Maintenance and Repair Workers, General",8332.850279,10


## Industries Before Ranking

In [29]:
# One row per industry (NAICS code, description, tool-consumption score):
#   emp_sum      - total of the 'emp' column
#   unique_FIPS  - number of distinct FIPS values in which the industry occurs
pattern_groups = filtered_pattern.groupby(['naics', 'DESCRIPTION', 'tool_consumption'])
pattern_totals = pattern_groups.agg(
    emp_sum=('emp', 'sum'),
    unique_FIPS=('FIPS', 'nunique'),
)

# Largest employment first, group keys turned back into ordinary columns
pattern_result = pattern_totals.sort_values('emp_sum', ascending=False).reset_index()

# Show the first 20 rows
pattern_result.head(20)

Unnamed: 0,naics,DESCRIPTION,tool_consumption,emp_sum,unique_FIPS
0,5613,Employment Services,3,7061425,1306
1,2382,Building Equipment Contractors,3,2171001,2756
2,5617,Services to Buildings and Dwellings,1,2004960,2530
3,5413,"Architectural, Engineering, and Related Services",4,1579218,1924
4,4411,Automobile Dealers,8,1245435,1953
5,7139,Other Amusement and Recreation Industries,8,1092894,2182
6,8111,Automotive Repair and Maintenance,1,902870,2725
7,2381,"Foundation, Structure, and Building Exterior C...",10,901696,2313
8,2383,Building Finishing Contractors,1,811812,2040
9,4238,"Machinery, Equipment, and Supplies Merchant Wh...",8,739134,1947


In [30]:
# Create a copy of the DataFrame
pattern_result_copy = pattern_result.copy()

# Calculate rank for all columns starting from 'tool_consumption', with equal values having the same rank
for column in pattern_result_copy.columns[pattern_result_copy.columns.get_loc('tool_consumption'):]:
    rank_column_name = f'rank_{column}'
    pattern_result_copy[rank_column_name] = pattern_result_copy[column].rank(method='min', ascending=False).astype(int)

# Display the first few rows of the copied data
pattern_result_copy.head()

Unnamed: 0,naics,DESCRIPTION,tool_consumption,emp_sum,unique_FIPS,rank_tool_consumption,rank_emp_sum,rank_unique_FIPS
0,5613,Employment Services,3,7061425,1306,47,1,14
1,2382,Building Equipment Contractors,3,2171001,2756,47,2,1
2,5617,Services to Buildings and Dwellings,1,2004960,2530,59,3,3
3,5413,"Architectural, Engineering, and Related Services",4,1579218,1924,37,4,10
4,4411,Automobile Dealers,8,1245435,1953,13,5,8


## Industries After Ranking

We now add weights; the highest weight goes to the **tool consumption rank**.

In [31]:
# Share of each rank column in the weighted industry score
weights = dict(
    rank_tool_consumption=0.5,  # rank of 'tool_consumption'
    rank_emp_sum=0.3,           # rank of 'emp_sum'
    rank_unique_FIPS=0.2,       # rank of 'unique_FIPS'
)

In [32]:
# calculate the weighted sum
pattern_result_copy['Weighted_Sum'] = (pattern_result_copy['rank_tool_consumption'] * weights['rank_tool_consumption'] +
                          pattern_result_copy['rank_emp_sum'] * weights['rank_emp_sum'] +
                          pattern_result_copy['rank_unique_FIPS'] * weights['rank_unique_FIPS']
                          ) # Display the first few rows
pattern_sorted = pattern_result_copy.sort_values(by='Weighted_Sum', ascending=True)

In [33]:
#Final Sorted Top Industries to be used later for filtering industies
pattern_sorted.head(10)

Unnamed: 0,naics,DESCRIPTION,tool_consumption,emp_sum,unique_FIPS,rank_tool_consumption,rank_emp_sum,rank_unique_FIPS,Weighted_Sum
7,2381,"Foundation, Structure, and Building Exterior C...",10,901696,2313,1,8,5,3.9
5,7139,Other Amusement and Recreation Industries,8,1092894,2182,13,6,6,9.5
4,4411,Automobile Dealers,8,1245435,1953,13,5,8,9.6
18,4231,Motor Vehicle and Motor Vehicle Parts and Supp...,10,390199,1030,1,19,20,10.2
20,3231,Printing and Related Support Activities,10,356146,1116,1,21,18,10.4
9,4238,"Machinery, Equipment, and Supplies Merchant Wh...",8,739134,1947,13,10,9,11.3
10,2362,Nonresidential Building Construction,8,657152,1687,13,11,11,12.0
35,8113,Commercial and Industrial Machinery and Equipm...,10,189786,1615,1,36,12,13.7
31,3399,Other Miscellaneous Manufacturing,10,215641,900,1,32,23,14.7
25,2373,"Highway, Street, and Bridge Construction",9,277718,905,10,26,22,17.2


In [35]:
# NAICS codes of the 20 industries with the smallest weighted sum
top20_industries = pattern_sorted['naics'].head(20).tolist()
top20_industries

['2381',
 '7139',
 '4411',
 '4231',
 '3231',
 '4238',
 '2362',
 '8113',
 '3399',
 '2373',
 '3320A1',
 '5413',
 '2131',
 '4881',
 '4413',
 '3330A1',
 '2382',
 '3211',
 '2212',
 '3119']

**The following industries need to be cleaned**

*   3320A1
*   3330A1

We need to use the BLS pages (for example https://www.bls.gov/oes/2023/may/naics4_3320A2.htm) <br> to check which 4-digit NAICS codes these 6-character codes actually represent.





In [46]:
# Adding all needed mappings
naics_expanding = {
    '3330A1': {
        'codes': ['3331', '3332', '3334', '3339'],
        'description': [
            'Agricultural Machinery',
            'Industrial Machinery',
            'Metalworking Machinery',
            'Other Machinery'
        ]
    },
    '3330A1': {
        'codes': ['3331', '3332', '3334', '3339'],
        'description': [
            'Agricultural Implement Manufacturing',
            'Industrial Machinery Manufacturing',
            'Metalworking Machinery Manufacturing',
            'Other General Purpose Machinery Manufacturing'
        ]
    }
}



---





**2. Second, move on to the occupation dataset**




In [36]:
# Preview the occupation frame, including the tool_consumption_occ column added above.
filtered_occupation.head()

Unnamed: 0,FIPS,naics,NAICS_TITLE,OCC_CODE,OCC_TITLE,emp_occupation,tool_consumption_occ
0,12999,5613,Employment Services,49-9071,"Maintenance and Repair Workers, General",20639.514235,10
1,6999,5613,Employment Services,49-9071,"Maintenance and Repair Workers, General",9414.167765,10
2,36999,5613,Employment Services,49-9071,"Maintenance and Repair Workers, General",8332.850279,10
3,6037,8111,Automotive Repair and Maintenance,49-3023,Automotive Service Technicians and Mechanics,5913.423292,9
4,48999,5613,Employment Services,49-9071,"Maintenance and Repair Workers, General",5770.378034,10


In [41]:
# One row per (occupation code, title, tool-consumption score):
#   emp_occ_sum  - total of the 'emp_occupation' column
#   unique_FIPS  - number of distinct FIPS values in which the occupation occurs
occupation_groups = filtered_occupation.groupby(['OCC_CODE', 'OCC_TITLE', 'tool_consumption_occ'])
occupation_totals = occupation_groups.agg(
    emp_occ_sum=('emp_occupation', 'sum'),
    unique_FIPS=('FIPS', 'nunique'),
)

# Largest employment first, group keys turned back into ordinary columns
occupation_result = occupation_totals.sort_values('emp_occ_sum', ascending=False).reset_index()

# Show the first 20 rows
occupation_result.head(20)

Unnamed: 0,OCC_CODE,OCC_TITLE,tool_consumption_occ,emp_occ_sum,unique_FIPS
0,49-3023,Automotive Service Technicians and Mechanics,9,251651.552293,2764
1,49-9071,"Maintenance and Repair Workers, General",10,128624.254913,1528
2,51-4121,"Welders, Cutters, Solderers, and Brazers",2,123496.98398,2475
3,49-3021,Automotive Body and Related Repairers,9,103113.463321,2725
4,51-4041,Machinists,4,101408.046543,1295
5,51-4072,"Molding, Coremaking, and Casting Machine Sette...",8,95152.178492,1180
6,49-9071,"Maintenance and Repair Workers, General",2,86146.54514,2688
7,17-2141,Mechanical Engineers,2,78136.266519,2343
8,49-9041,Industrial Machinery Mechanics,7,78090.253095,1889
9,51-4121,"Welders, Cutters, Solderers, and Brazers",7,75999.017971,1880


In [42]:
# Create a copy of the DataFrame
occupation_result_copy = occupation_result.copy()

# Calculate rank for all columns starting from 'tool_consumption', with equal values having the same rank
for column in occupation_result_copy.columns[occupation_result_copy.columns.get_loc('tool_consumption_occ'):]:
    rank_column_name = f'rank_{column}'
    occupation_result_copy[rank_column_name] = occupation_result_copy[column].rank(method='min', ascending=False).astype(int)

# Display the first few rows of the copied data
occupation_result_copy.head()

Unnamed: 0,OCC_CODE,OCC_TITLE,tool_consumption_occ,emp_occ_sum,unique_FIPS,rank_tool_consumption_occ,rank_emp_occ_sum,rank_unique_FIPS
0,49-3023,Automotive Service Technicians and Mechanics,9,251651.552293,2764,64,1,4
1,49-9071,"Maintenance and Repair Workers, General",10,128624.254913,1528,1,2,145
2,51-4121,"Welders, Cutters, Solderers, and Brazers",2,123496.98398,2475,432,3,25
3,49-3021,Automotive Body and Related Repairers,9,103113.463321,2725,64,4,14
4,51-4041,Machinists,4,101408.046543,1295,353,5,200


In [43]:
# Share of each rank column in the weighted occupation score
# (rebinds `weights`, replacing the industry weights defined earlier)
weights = dict(
    rank_tool_consumption_occ=0.5,  # rank of 'tool_consumption_occ'
    rank_emp_occ_sum=0.3,           # rank of 'emp_occ_sum'
    rank_unique_FIPS=0.2,           # rank of 'unique_FIPS'
)


In [44]:
# calculate the weighted sum
occupation_result_copy['Weighted_Sum_Occ'] = (occupation_result_copy['rank_tool_consumption_occ'] * weights['rank_tool_consumption_occ'] +
                          occupation_result_copy['rank_emp_occ_sum'] * weights['rank_emp_occ_sum'] +
                          occupation_result_copy['rank_unique_FIPS'] * weights['rank_unique_FIPS']
                          ) # Display the first few rows
occupation_sorted = occupation_result_copy.sort_values(by='Weighted_Sum_Occ', ascending=True)

In [45]:
# Top 20 occupation rows by weighted rank (smallest Weighted_Sum_Occ first).
occupation_sorted.head(20)

Unnamed: 0,OCC_CODE,OCC_TITLE,tool_consumption_occ,emp_occ_sum,unique_FIPS,rank_tool_consumption_occ,rank_emp_occ_sum,rank_unique_FIPS,Weighted_Sum_Occ
1,49-9071,"Maintenance and Repair Workers, General",10,128624.254913,1528,1,2,145,30.1
0,49-3023,Automotive Service Technicians and Mechanics,9,251651.552293,2764,64,1,4,33.1
20,51-4121,"Welders, Cutters, Solderers, and Brazers",10,39638.803736,1518,1,21,146,36.0
3,49-3021,Automotive Body and Related Repairers,9,103113.463321,2725,64,4,14,36.0
14,49-3031,Bus and Truck Mechanics and Diesel Engine Spec...,9,47908.375941,2822,64,15,1,36.7
12,51-4041,Machinists,10,57753.89182,1380,1,13,171,38.6
31,17-2112,Industrial Engineers,10,32029.465163,1516,1,32,147,39.5
30,49-9041,Industrial Machinery Mechanics,10,32750.585733,1508,1,31,151,40.0
18,51-4081,"Multiple Machine Tool Setters, Operators, and ...",10,42270.205822,1374,1,19,172,40.6
42,17-2141,Mechanical Engineers,10,26362.201839,1509,1,43,150,43.4


**Now we will filter the occupations down to the ones that are going to use our products**

*   Assume some occupations that are important like
  1.   '51-4121' > Welders, Cutters, Solderers, and Brazers
  2.   '47-2221' > Structural Iron and Steel Workers
  3.   '51-2041' > Structural Metal Fabricators and Fitters
  4.   '49-3021' > Automotive Body and Related Repairers
  5.   '51-4041' > Machinists
  6.   '49-9041' > Industrial Machinery Mechanics
  7.   '49-9071' > Maintenance and Repair Workers, General
  8.   '51-4081' > Multiple Machine Tool Setters, Operators, and Tenders, Metal and Plastic
  9.   '47-2211' > Sheet Metal Workers
  10.  '49-3031' > Bus and Truck Mechanics and Diesel Engine Specialists
  11.   '51-4033' > Grinding, Lapping, Polishing, and Buffing Machine Tool Setters, Operators, and Tenders, Metal and Plastic
  12.   '49-3023' > Automotive Service Technicians and Mechanics
  13.   '47-2011' > Boilermakers
  14.   '51-4122' > Welding, Soldering, and Brazing Machine Setters, Operators, and Tenders
  15.   '51-9021' > Crushing, Grinding, and Polishing Machine Setters, Operators, and Tenders
  16.   '51-4031' > Cutting, Punching, and Press Machine Setters, Operators, and Tenders, Metal and Plastic
  17.   '49-3011' > Aircraft Mechanics and Service Technicians
  18.   '51-4111' > Tool and Die Makers
  19.   '51-9032' > Cutting and Slicing Machine Setters, Operators, and Tenders
  20.   '49-9043' > Maintenance Workers, Machinery


# Feature overview

In [None]:
# # Merge the geographic data with the GDP data
# merged_data = pd.merge(county_data, filtered_gdp, on='FIPS', how='inner')

# # Merge the industry pattern data
# merged_data = pd.merge(merged_data, filtered_pattern, on='FIPS', how='inner')

# # Merge the occupation data
# #merged_data = pd.merge(merged_data, filtered_occupation, on='FIPS', how='inner')

# # Merge the state data
# #merged_data = pd.merge(merged_data, state_data, on='STATEFP', how='inner')


In [None]:
# NOTE(review): if this is re-enabled, comparing the column to a list with `==` does not
# test membership; `.isin(occ_top10)` is presumably what is intended — confirm.
# filtered_occ_final = filtered_occupation[filtered_occupation['OCC_CODE'] == occ_top10]
# filtered_occ_final.head()

**Features:**

- **emp:** employment + noise flag
- **qp1:** first quarter payroll + noise flag
- **ap:** annual payroll + noise flag
- **est:** number of establishments (total and for different employment sizes)

**Features:**

- **emp_total_county_naics:** total employment for all existing NAICS codes
- **emp_occupation:** specific employment for specific combination of occupation and industry

# Feature selection

We will select an exemplary feature in this code.<br>  

We will focus on the feature **number of companies (est)**. Regions where specific industries are located or concentrated will usually also consume tools.

What is NAICS 3320A2? -- 3323, 3324

https://www.bls.gov/oes/2023/may/naics4_3320A2.htm

Fabricated Metal Product Manufacturing (3323 and 3324 only)

- 3323: Architectural and Structural Metals Manufacturing
- 3324: Boiler, Tank, and Shipping Container Manufacturing

# Feature construction

**How do you choose the most important and relevant industries?**

Viewed as a whole, the project has requirements from various sides:
- Domain: tool consumption / employment
- Modeling: clustering

<br> For the **domain side**, it makes sense to choose industries that have a high share of employment and a high consumption of tools.
<br> For the **modeling side**, it makes sense to cover as many regions as possible. Since we will later use a maximum of 15 features, it makes no sense to select industries that only occur in a very small number of regions.

## Domain

### Tool consumption

Here, the individual industries must be evaluated by **domain knowledge** with regard to their potential tool consumption.<br>
We will insert random values between 0 and 10 as an example

# **Loading Map & Ploting it**

We will load the map data and plot it.

Then we will merge it with the other dataframes.

In [None]:
# Merge the geographic data with the GDP data.
# The county key was renamed from 'GEOID' to 'FIPS' earlier and gdp_data has a
# 'FIPS' column, so 'FIPS' is the shared key (neither frame has 'GEOID' any more).
# NOTE(review): county_data['FIPS'] was cast to int; confirm that gdp_data['FIPS'] and
# filtered_occupation['FIPS'] have the same dtype, otherwise the merges will not match.
# NOTE(review): the pattern and occupation frames hold many rows per FIPS, so these inner
# merges multiply rows; pivot to one row per FIPS before clustering.
merged_data = pd.merge(county_data, gdp_data, on='FIPS', how='inner')

# Merge the industry pattern data (the frame defined above is `filtered_pattern`)
merged_data = pd.merge(merged_data, filtered_pattern, on='FIPS', how='inner')

# Merge the occupation data (the frame defined above is `filtered_occupation`)
merged_data = pd.merge(merged_data, filtered_occupation, on='FIPS', how='inner')

# Merge the state data (loaded above as `State_data`)
# NOTE(review): assumes State_data has a 'STATEFP' column — confirm.
merged_data = pd.merge(merged_data, State_data, on='STATEFP', how='inner')


# feature_df

# Scaling

# Dimension reduction

## PCA

In [None]:
# Occupation shortlists used for filtering.
#
# occ_top10: the ten best-ranked occupation codes from `occupation_sorted`, the
# weighted ranking built above (smallest Weighted_Sum_Occ first). The ranking can
# hold several rows per OCC_CODE, so duplicates are dropped before taking ten.
# The previous version read `ranked_occupation_data_weighted`, which is never
# defined in this notebook.
occ_top10 = occupation_sorted['OCC_CODE'].drop_duplicates().head(10).tolist()

# occ_top20: hand-picked occupation codes.
# NOTE(review): despite its name this list holds 10 codes; the other 10 codes of the
# markdown list are in the commented-out `occ_top10_20` below — confirm which is intended.
occ_top20 = ['51-4121', '47-2221', '51-2041', '49-3021', '51-4041', '49-9041', '49-9071', '51-4081', '47-2211', '49-3031']

# Alternative shortlists, currently disabled:
# #occ_top10_20 = ranked_occupation_data_weighted['OCC_CODE'][10:20].tolist()
# occ_top10_20 = ['51-4033', '49-3023', '47-2011', '51-4122', '51-9021', '51-4031', '49-3011', '51-4111', '51-9032', '49-9043']
# naics_top6 = ['2382','8111','3320A2','3330A1','3327','3363']
# naics_top_metall = ['3320A1','3335','3364','3362','3315','3366','2379','3336','3311','3314']
# print(occ_top10)
# print(occ_top10_20)
# print(naics_top6)
# print(naics_top_metall)