In [1]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('css_properties_output2.csv')

# View the first few rows of the dataset
df.head()

Unnamed: 0,selector,display,padding,font-family,color,font-size
0,".about__content--style2 .about__content-inner,...",flex,,,,
1,".footer__app-inner, .account__social-btn, .sub...",,,,,
2,".roadmap--style1 .roadmap__item-header, .accor...",,,,,
3,".about--style3 .about__content ul li, .roadmap...",,,,,
4,".about__icon-inner, .social__link--style4, .so...",,,,,


In [2]:
# Check data types and look for missing values
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149 entries, 0 to 1148
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   selector     1149 non-null   object
 1   display      45 non-null     object
 2   padding      82 non-null     object
 3   font-family  31 non-null     object
 4   color        207 non-null    object
 5   font-size    93 non-null     object
dtypes: object(6)
memory usage: 54.0+ KB
None


In [3]:
# Check for missing data
print(df.isnull().sum())


selector          0
display        1104
padding        1067
font-family    1118
color           942
font-size      1056
dtype: int64


In [4]:
# Impute missing values for each column
df['display'] = df['display'].fillna(df['display'].mode()[0])  # Using the mode for display
df['padding'] = df['padding'].fillna('0px')  # Using '0px' for padding
df['font-family'] = df['font-family'].fillna('"sans-serif"')  # Using 'sans-serif' for font-family
df['color'] = df['color'].fillna('#000000')  # Using black for color
df['font-size'] = df['font-size'].fillna('16px')  # Using '16px' for font-size


In [5]:
print(df.isnull().sum())


selector       0
display        0
padding        0
font-family    0
color          0
font-size      0
dtype: int64


In [6]:
df.head()

Unnamed: 0,selector,display,padding,font-family,color,font-size
0,".about__content--style2 .about__content-inner,...",flex,0px,"""sans-serif""",#000000,16px
1,".footer__app-inner, .account__social-btn, .sub...",none,0px,"""sans-serif""",#000000,16px
2,".roadmap--style1 .roadmap__item-header, .accor...",none,0px,"""sans-serif""",#000000,16px
3,".about--style3 .about__content ul li, .roadmap...",none,0px,"""sans-serif""",#000000,16px
4,".about__icon-inner, .social__link--style4, .so...",none,0px,"""sans-serif""",#000000,16px


In [9]:
import pandas as pd
import re

# Define the conversion function for padding and font-size
def convert_padding_size(value):
    if pd.isna(value):
        return None  # Handle NaN values

    value = str(value).strip()  # Ensure value is string and strip any extra spaces

    # Extract the number and the unit from the padding value
    match = re.match(r"([-+]?\d*\.?\d+)([a-zA-Z%]+)?", value)
    if match:
        number = float(match.group(1))
        unit = match.group(2) or 'px'  # Default to 'px' if no unit is found

        # Handle different units (convert to a common unit, e.g., px)
        if unit == 'px':
            return number
        elif unit == 'em':
            return number * 16  # Convert em to px (assuming 1em = 16px)
        elif unit == 'rem':
            return number * 16  # Convert rem to px (assuming 1rem = 16px)
        elif unit == '%':
            # Assuming a reference width of 1000px for percentage
            return number * 10  # Convert % to px based on 1000px parent width
        else:
            raise ValueError(f"Unhandled unit: {unit}")
    else:
        return None  # Return None if unable to parse


# Apply the conversion function to the 'padding' column
df['padding'] = df['padding'].astype(str).apply(convert_padding_size)

# Clean 'font-size' column (integrate the function we discussed earlier)
df['font-size'] = df['font-size'].apply(convert_font_size)

# Clean 'font-size' column and convert to numeric, handling non-numeric values
df['font-size'] = pd.to_numeric(df['font-size'], errors='coerce')

# Drop rows with NaN values in 'font-size' or 'padding'
df.dropna(subset=['font-size', 'padding'], inplace=True)

# Optionally, you can print the DataFrame to check the results
print(df.head())


ValueError: Unhandled unit: %

In [None]:
df.head()

In [None]:
print("Original DataFrame:")
print(df)

# One-Hot Encoding categorical variables
df_encoded = pd.get_dummies(df, columns=['display', 'font-family', 'color'], drop_first=True)

# Display the encoded DataFrame
print("\nEncoded DataFrame:")
df_encoded.head()

In [None]:
df_encoded.to_csv('encoded_css_data2.csv', index=False)