In [1]:
#string manipulation
from collections import Counter

def count_word_frequency(file_path, top_n=5):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read().lower()
    
    words = text.split()
    word_counts = Counter(words)
    
    return word_counts.most_common(top_n)

# Example usage
file_path = "str.txt"  # Replace with your file path
top_words = count_word_frequency(file_path, top_n=5)
print(top_words)


[('the', 30), ('and', 28), ('in', 20), ('as', 17), ('elephants', 14)]


In [2]:
def count_even_odd(numbers):
    count = {"even": 0, "odd": 0}
    
    for num in numbers:
        if num % 2 == 0:
            count["even"] += 1
        else:
            count["odd"] += 1
    
    return count

# Example usage
numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
result = count_even_odd(numbers)
print(result)


{'even': 5, 'odd': 5}


In [1]:
import csv

def count_csv_rows(file_path):
    try:
        with open(file_path, mode='r', newline='') as file:
            reader = csv.reader(file)
            row_count = sum(1 for row in reader)
        print(f"Total rows: {row_count}")
    except FileNotFoundError:
        print("Error: File not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Example usage
file_path = "data.csv"  # Replace with the actual CSV file path
count_csv_rows(file_path)


Total rows: 50986


In [11]:
import re

def extract_emails(file_path):
    try:
        with open(file_path, "r") as file:
            content = file.read()
        
        # Regular expression for valid email addresses
        email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
        emails = re.findall(email_pattern, content)
        
        print("Extracted Emails:")
        for email in emails:
            print(email)
    
    except FileNotFoundError:
        print("Error: File not found.")
    except Exception as e:
        print(f"An             occurred: {e}")

# Example usage
file_path = "sample.txt"  # Replace with the actual file path
extract_emails(file_path)


Extracted Emails:
support@example.com
sales@shopnow.com
john.doe123@gmail.com
alice_smith@outlook.com
hr@company.org
it_support@techservices.net
ceo@bigcorp.com


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder

# Load the dataset
df = pd.read_csv('employee_attrition.csv')

# Display basic information
print(df.info())
print(df.describe())

# Identify categorical and numerical features
categorical_cols = df.select_dtypes(include=['object']).columns
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
print("Categorical Columns:", categorical_cols)
print("Numerical Columns:", numerical_cols)

# Convert categorical columns to numerical using Label Encoding
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Scale numerical columns using StandardScaler
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Feature engineering: Create new meaningful features
if 'Age' in df.columns and 'EducationYears' in df.columns:
    df['TotalWorkingYears'] = df['Age'] - df['EducationYears']

# Check for skewness and apply log transformation if needed
skewed_cols = df[numerical_cols].apply(lambda x: x.skew()).sort_values(ascending=False)
skewed_cols = skewed_cols[skewed_cols > 0.75].index

df[skewed_cols] = np.log1p(df[skewed_cols])

# Save the transformed dataset to a new CSV file
df.to_csv('employee_attrition_transformed.csv', index=False)

# Visualizations
sns.pairplot(df[numerical_cols].dropna())
plt.show()

sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
plt.show()

# Correlation analysis
print(df.corr())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

  result = func(self.values, **kwargs)


In [1]:
import pandas as pd

def process_employee_salaries(file_path, output_file="filtered_salaries.csv"):
    try:
        df = pd.read_csv(file_path)  
        print("Summary Statistics:\n", df.describe())  
        
        filtered_df = df[df["Salary"] > 50000]  
        filtered_df.to_csv(output_file, index=False)  
        print(f"Filtered results saved to {output_file}.")
        
        return filtered_df
    except FileNotFoundError:
        print("Error: File not found.")
    except KeyError:
        print("Error: 'Salary' column not found in CSV.")

def main():
    file_path = "Employee_Salary_Dataset.csv"  
    process_employee_salaries(file_path)

if __name__ == "__main__":
    main()


Summary Statistics:
               ID  Experience_Years        Age        Salary
count  35.000000          35.00000  35.000000  3.500000e+01
mean   18.000000           9.20000  35.485714  2.059147e+06
std    10.246951           7.55295  14.643552  3.170124e+06
min     1.000000           1.00000  17.000000  3.000000e+03
25%     9.500000           2.50000  22.500000  2.250000e+04
50%    18.000000           6.00000  29.000000  2.500000e+05
75%    26.500000          15.00000  53.500000  3.270000e+06
max    35.000000          27.00000  62.000000  1.000000e+07
Filtered results saved to filtered_salaries.csv.
