In [4]:
#Importing Libraries
import os
import pandas as pd
import numpy as np

In [6]:
#get working directories
current_dir = os.getcwd() 
#Go one directory up to the root directory
project_root_dir = os.path.dirname(current_dir) 
project_root_dir
# Define paths to the data folders
data_dir = os.path.join(project_root_dir, "Data")
raw_dir = os.path.join(data_dir, "raw")
processed_dir = os.path.join(data_dir, "processed")
# Define paths to results folder
results_dir = os.path.join(project_root_dir, "results")
#define paths to the docs folder
docs_dir = os.path.join(project_root_dir,"docs")

# Creates directories if they do not exist
os.makedirs(raw_dir, exist_ok = True)
os.makedirs(processed_dir, exist_ok = True)
os.makedirs(results_dir, exist_ok = True)
os.makedirs(docs_dir, exist_ok = True)




## Loading the Dataset

In [8]:
crimes_df = pd.read_csv(r"C:\Users\user\Downloads\Crime Against Woman\CrimesOnWomenData.csv")
description_df = pd.read_csv(r"C:\Users\user\Downloads\Crime Against Woman\description.csv")
crimes_df.head(), description_df.head()

(   Unnamed: 0              State  Year  Rape   K&A   DD   AoW   AoM    DV  WT
 0           0     ANDHRA PRADESH  2001   871   765  420  3544  2271  5791   7
 1           1  ARUNACHAL PRADESH  2001    33    55    0    78     3    11   0
 2           2              ASSAM  2001   817  1070   59   850     4  1248   0
 3           3              BIHAR  2001   888   518  859   562    21  1558  83
 4           4       CHHATTISGARH  2001   959   171   70  1763   161   840   0,
    Unnamed: 0 Column Names         Explanation
 0           0        State               State
 1           1         Year                Year
 2           2         Rape   No. of Rape cases
 3           3          K&A  Kidnap And Assault
 4           4           DD        Dowry Deaths)

In [10]:
crimes_df.shape

(736, 10)

In [12]:
crimes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 736 entries, 0 to 735
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  736 non-null    int64 
 1   State       736 non-null    object
 2   Year        736 non-null    int64 
 3   Rape        736 non-null    int64 
 4   K&A         736 non-null    int64 
 5   DD          736 non-null    int64 
 6   AoW         736 non-null    int64 
 7   AoM         736 non-null    int64 
 8   DV          736 non-null    int64 
 9   WT          736 non-null    int64 
dtypes: int64(9), object(1)
memory usage: 57.6+ KB


In [14]:
crimes_df.columns

Index(['Unnamed: 0', 'State', 'Year', 'Rape', 'K&A', 'DD', 'AoW', 'AoM', 'DV',
       'WT'],
      dtype='object')

## Data Cleaning

In [16]:
# Checking if there are missing values
crimes_df.isnull().sum().sum()

0

In [18]:
# Checking for duplicate
crimes_df.duplicated().sum()

0

### Manually define the short and long column names 

In [21]:
# Manually define the short and long column names as lists
short_names = ['State', 'Year', 'Rape', 'K&A', 'DD', 'AoM', 'AoW', 'DV', 'WT']
long_names = ['State', 'Year', 'No. of Rape cases', 'Kidnap And Assault', 'Dowry Deaths',
              'Assault against modesty of women', 'Assault against women',
              'Domestic violence', 'Women Trafficking']

# Create a mapping using zip()
column_mapping = dict(zip(short_names, long_names))

# Apply the mapping to rename columns
crimes_df.rename(columns=column_mapping, inplace=True)
crimes_df

Unnamed: 0.1,Unnamed: 0,State,Year,No. of Rape cases,Kidnap And Assault,Dowry Deaths,Assault against women,Assault against modesty of women,Domestic violence,Women Trafficking
0,0,ANDHRA PRADESH,2001,871,765,420,3544,2271,5791,7
1,1,ARUNACHAL PRADESH,2001,33,55,0,78,3,11,0
2,2,ASSAM,2001,817,1070,59,850,4,1248,0
3,3,BIHAR,2001,888,518,859,562,21,1558,83
4,4,CHHATTISGARH,2001,959,171,70,1763,161,840,0
...,...,...,...,...,...,...,...,...,...,...
731,731,D&N Haveli,2021,1250,4083,141,2068,417,4731,4
732,732,Daman & Diu,2021,315,904,16,1851,10,501,1
733,733,Delhi UT,2021,2,1,0,5,1,9,0
734,734,Lakshadweep,2021,0,0,0,1,1,3,0


In [24]:
crimes_df.shape

(736, 10)

In [26]:
crimes_df.to_csv('Cleaned_crimes_on_women.csv', index=False)

In [28]:
# Convert all state names to lowercase
crimes_df['State'] = crimes_df['State'].str.lower()

# Display unique states to verify transformation
crimes_df['State'].unique()


array(['andhra pradesh', 'arunachal pradesh', 'assam', 'bihar',
       'chhattisgarh', 'goa', 'gujarat', 'haryana', 'himachal pradesh',
       'jammu & kashmir', 'jharkhand', 'karnataka', 'kerala',
       'madhya pradesh', 'maharashtra', 'manipur', 'meghalaya', 'mizoram',
       'nagaland', 'odisha', 'punjab', 'rajasthan', 'sikkim',
       'tamil nadu', 'tripura', 'uttar pradesh', 'uttarakhand',
       'west bengal', 'a & n islands', 'chandigarh', 'd & n haveli',
       'daman & diu', 'lakshadweep', 'puducherry', 'telangana',
       'd&n haveli', 'delhi ut'], dtype=object)

In [30]:
crimes_df

Unnamed: 0.1,Unnamed: 0,State,Year,No. of Rape cases,Kidnap And Assault,Dowry Deaths,Assault against women,Assault against modesty of women,Domestic violence,Women Trafficking
0,0,andhra pradesh,2001,871,765,420,3544,2271,5791,7
1,1,arunachal pradesh,2001,33,55,0,78,3,11,0
2,2,assam,2001,817,1070,59,850,4,1248,0
3,3,bihar,2001,888,518,859,562,21,1558,83
4,4,chhattisgarh,2001,959,171,70,1763,161,840,0
...,...,...,...,...,...,...,...,...,...,...
731,731,d&n haveli,2021,1250,4083,141,2068,417,4731,4
732,732,daman & diu,2021,315,904,16,1851,10,501,1
733,733,delhi ut,2021,2,1,0,5,1,9,0
734,734,lakshadweep,2021,0,0,0,1,1,3,0


### Checking duplicate

In [43]:
crimes_df.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
731    False
732    False
733    False
734    False
735    False
Length: 736, dtype: bool

### remove unnecessary column

In [54]:
crimes_df.drop(columns=['total_crimes'], inplace=True)

In [55]:
crimes_df

Unnamed: 0,State,Year,No. of Rape cases,Kidnap And Assault,Dowry Deaths,Assault against women,Assault against modesty of women,Domestic violence,Women Trafficking
0,andhra pradesh,2001,871,765,420,3544,2271,5791,7
1,arunachal pradesh,2001,33,55,0,78,3,11,0
2,assam,2001,817,1070,59,850,4,1248,0
3,bihar,2001,888,518,859,562,21,1558,83
4,chhattisgarh,2001,959,171,70,1763,161,840,0
...,...,...,...,...,...,...,...,...,...
731,d&n haveli,2021,1250,4083,141,2068,417,4731,4
732,daman & diu,2021,315,904,16,1851,10,501,1
733,delhi ut,2021,2,1,0,5,1,9,0
734,lakshadweep,2021,0,0,0,1,1,3,0


### Reshape data from wide format into long format 

this helps to get some visuals we need and then also to see the values becouse it is in wide format

In [47]:
# Melt (reshape) the data
df_long = crimes_df.melt(id_vars=["State", "Year"], 
                  var_name="Crime Type", 
                  value_name="Value")

# Save reshaped data
final_file = os.path.join(processed_dir, 'reshaped_data.csv')
df_long.to_csv("reshaped_data.csv", index=False) 


In [57]:
reshaped_df=pd.read_csv(r"C:\Users\user\Downloads\Crime\crime_against_on_womens\Data\processed\reshaped_data.csv")
reshaped_df

Unnamed: 0,State,Year,Crime Type,Value
0,andhra pradesh,2001,No. of Rape cases,871
1,arunachal pradesh,2001,No. of Rape cases,33
2,assam,2001,No. of Rape cases,817
3,bihar,2001,No. of Rape cases,888
4,chhattisgarh,2001,No. of Rape cases,959
...,...,...,...,...
5147,d&n haveli,2021,Women Trafficking,4
5148,daman & diu,2021,Women Trafficking,1
5149,delhi ut,2021,Women Trafficking,0
5150,lakshadweep,2021,Women Trafficking,0


In [59]:
reshaped_df.isnull().sum().sum()

0

In [61]:
reshaped_df.duplicated().sum()

0