## LUKMAN ALFARIDZI ##

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import mutual_info_regression
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
# Initial load of the CSV using pandas' default parsing options.
# NOTE(review): `data` is not referenced by any later cell visible here.
data = pd.read_csv(filepath_or_buffer='regowlproduct.csv')

In [None]:
import pandas as pd

# Absolute location of the source CSV on the author's machine
PATH = "C:\\Users\\Lukman\\regowlproduct.csv"

# The file is semicolon-delimited.  Decode it as 'utf-8-sig' so that the UTF-8
# byte-order mark at the start of the file is stripped: decoding the same bytes
# as 'windows-1252' turns the BOM into the stray characters 'ï»¿' glued onto
# the first column name.
# NOTE(review): assumes the whole file is UTF-8 (the BOM indicates it is) --
# confirm against the actual data.
raw_df = pd.read_csv(PATH, encoding='utf-8-sig', sep=';')

# Work on a copy so the frame as loaded stays available unchanged
df = raw_df.copy()

# Let pandas show up to 999 rows/columns when displaying frames
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)

# Display the first few rows of the DataFrame
df.head()


In [None]:
# DATA CLEANING AND EXPLORATION

In [None]:
# Remove every space character from the column labels
space_free_labels = df.columns.str.replace(' ', '')
df.columns = space_free_labels

In [None]:
# Give the province column a clean name.  The 'ï»¿' prefix is what a UTF-8
# byte-order mark looks like once it has been decoded as windows-1252.
bom_label_fix = {'ï»¿Province': 'Province'}
df = df.rename(columns=bom_label_fix)

In [None]:
# Convert Total_Agent to a numeric column; entries that cannot be parsed
# as numbers become NaN instead of raising.
agent_counts = pd.to_numeric(df['Total_Agent'], errors='coerce')
df['Total_Agent'] = agent_counts

In [None]:
# Print the column summary of df (info() writes directly to the output).
df.info()
# Summary statistics; kept as the final bare expression of the cell.
# NOTE(review): presumably relies on the notebook echoing a cell's last expression.
df.describe()

In [None]:
# Show the column labels currently on df
print(df.columns)

# DATA ANALYSIS

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import FuncFormatter

# Order the rows from lowest to highest total revenue
sorted_df = df.sort_values(by='Total_Revenue', ascending=True)

plt.figure(figsize=(18, 8))

# Horizontal bars of total revenue, one per province.  Plot the sorted frame:
# the original passed the unsorted `df`, so the sort above had no effect.
# NOTE(review): assumes 'Province' is a plain string column, in which case
# seaborn draws the bars in the order the provinces appear in the data.
ax = sns.barplot(x='Total_Revenue', y='Province', data=sorted_df)


def rupiah_formatter(x, pos):
    """Format tick value *x* as a Rupiah amount; *pos* is required by FuncFormatter but unused."""
    return f'Rp {x:,.0f}'


ax.xaxis.set_major_formatter(FuncFormatter(rupiah_formatter))

# Horizontal gap, in x-axis data units, between a bar's end and its label
padding = 10000000  # Adjust the padding value as needed

# Write each bar's value just past its right-hand end, vertically centred
for bar in ax.patches:
    value = bar.get_width()
    mid_y = bar.get_y() + bar.get_height() / 2
    ax.annotate(f'Rp {value:,.0f}', (value + padding, mid_y),
                ha='left', va='center', fontsize=12, color='black')

plt.title('Total Revenue each Province', fontsize=16)
plt.xlabel('Total Revenue (in Rupiah)', fontsize=14)
plt.ylabel('Province', fontsize=14)

plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Wide canvas for the per-province chart
plt.figure(figsize=(18, 8))

# One bar per province; each bar's height is the sum of Total_Agent
# over that province's rows
ax = sns.barplot(data=df, x='Province', y='Total_Agent', estimator=sum)

ax.set_title('Total Agents by Province', fontsize=16)
ax.set_xlabel('Province', fontsize=14)
ax.set_ylabel('Total Agents', fontsize=14)

# Turn the province labels vertical
plt.xticks(rotation=90, fontsize=12)

# Put each bar's value at the horizontal centre of the bar, at its top
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(f'{bar_top:.0f}', (bar_center, bar_top),
                ha='center', va='baseline', fontsize=12, color='black')

plt.show()

In [None]:
# Histogram of Total_Agent in 10 bins with a KDE curve overlaid
plt.figure(figsize=(8, 6))
agent_series = df['Total_Agent']
hist_ax = sns.histplot(agent_series, bins=10, kde=True)
hist_ax.set_title('Distribution of Total Agents')
hist_ax.set_xlabel('Total Agents')
hist_ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Correlate the numeric columns only.  df also carries the text column
# 'Province'; on pandas >= 2.0 calling DataFrame.corr() on a frame with a
# string column raises instead of silently skipping it, so restrict the frame
# explicitly (select_dtypes works on older pandas versions as well).
numeric_df = df.select_dtypes(include='number')
correlation_matrix = numeric_df.corr()

# Annotated heatmap of the pairwise correlations
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from scipy.stats import norm

# Keep only rows where both variables are present.  Total_Agent is coerced to
# numeric with errors='coerce' earlier in the notebook, so it may contain NaN,
# and LinearRegression.fit rejects NaN input.
reg_df = df[['Total_Agent', 'Total_Revenue']].dropna()
X = reg_df[['Total_Agent']]
y = reg_df['Total_Revenue']

# Scatter of the observations used for the fit
plt.scatter(X['Total_Agent'], y)

# Fit a one-feature linear regression: Total_Revenue ~ Total_Agent
model = LinearRegression()
model.fit(X, y)

# 100 evenly spaced agent counts spanning the observed range, as a column vector
x = np.linspace(X['Total_Agent'].min(), X['Total_Agent'].max(), 100).reshape(-1, 1)
# Predict through a DataFrame carrying the training column name so scikit-learn
# does not warn about missing feature names
y_pred = model.predict(pd.DataFrame(x, columns=X.columns))

# Plot the regression line
plt.plot(x, y_pred, color='red')

# Standard deviation of the residuals, scaled by the 97.5% normal quantile
# (about 1.96) to give the half-width of an approximate 95% band.  The band
# has constant width around the fitted line.
sigma = np.std(y - model.predict(X))
y_err = sigma * norm.ppf(0.975)

# Shade the band around the fitted line
plt.fill_between(x.flatten(), (y_pred - y_err), (y_pred + y_err), color='red', alpha=0.2)

plt.title('Total Revenue vs. Total Agent')
plt.xlabel('Total Agent')
plt.ylabel('Total Revenue')

plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import FuncFormatter

# Order the rows from highest to lowest total revenue
sorted_df = df.sort_values(by='Total_Revenue', ascending=False)

# Despite its name, this holds the 20 rows with the HIGHEST total revenue
top_20_lowest_sales = sorted_df.head(20)

plt.figure(figsize=(18, 8))

# Horizontal bars of total revenue for those 20 rows, one bar per province
ax = sns.barplot(x='Total_Revenue', y='Province', data=top_20_lowest_sales)


def rupiah_formatter(x, pos):
    """Format tick value *x* as a Rupiah amount; *pos* is required by FuncFormatter but unused."""
    return f'Rp {x:,.0f}'


ax.xaxis.set_major_formatter(FuncFormatter(rupiah_formatter))

# Write each bar's value right at its end, vertically centred on the bar
for bar in ax.patches:
    value = bar.get_width()
    bar_end = bar.get_x() + value
    mid_y = bar.get_y() + bar.get_height()/2
    ax.annotate(f'Rp {value:,.0f}', (bar_end, mid_y),
                ha='left', va='center', fontsize=12, color='black')

ax.set_title('Top 20 Highest by Province', fontsize=16)
ax.set_xlabel('Total_Revenue (in Rupiah)', fontsize=14)
ax.set_ylabel('Province', fontsize=14)

plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import FuncFormatter

# Order the rows from lowest to highest total revenue
sorted_df = df.sort_values(by='Total_Revenue', ascending=True)

# The 20 rows with the lowest total revenue
top_20_lowest_revenue = sorted_df.head(20)

plt.figure(figsize=(18, 8))

# Horizontal bars of total revenue for those 20 rows, one bar per province
ax = sns.barplot(x='Total_Revenue', y='Province', data=top_20_lowest_revenue)


def rupiah_formatter(x, pos):
    """Format tick value *x* as a Rupiah amount; *pos* is required by FuncFormatter but unused."""
    return f'Rp {x:,.0f}'


ax.xaxis.set_major_formatter(FuncFormatter(rupiah_formatter))

# Horizontal gap, in x-axis data units, between a bar's end and its label
padding = 100000000  # Adjust the padding value as needed

# Write each bar's value just past its right-hand end, vertically centred
for bar in ax.patches:
    value = bar.get_width()
    mid_y = bar.get_y() + bar.get_height() / 2
    ax.annotate(f'Rp {value:,.0f}', (value + padding, mid_y),
                ha='left', va='center', fontsize=12, color='black')

ax.set_title('Top 20 Lowest Total Revenue by Province', fontsize=16)
ax.set_xlabel('Total Revenue (in Rupiah)', fontsize=14)
ax.set_ylabel('Province', fontsize=14)

plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import FuncFormatter

# Order the rows from lowest to highest Serum value
sorted_df = df.sort_values(by='Serum', ascending=True)

# The 20 rows with the lowest Serum value.  The original stored this selection
# as `top_20_highest_revenue` but then plotted `top_20_lowest_revenue`, a name
# this cell never assigned -- so the chart showed stale rows left over from
# whichever cell ran before it (or failed with NameError on a fresh kernel).
top_20_lowest_revenue = sorted_df.head(20)

plt.figure(figsize=(18, 8))

# Horizontal bars of Serum for those 20 rows, one bar per province
ax = sns.barplot(x='Serum', y='Province', data=top_20_lowest_revenue)


def rupiah_formatter(x, pos):
    """Format tick value *x* as a Rupiah amount; *pos* is required by FuncFormatter but unused."""
    return f'Rp {x:,.0f}'


ax.xaxis.set_major_formatter(FuncFormatter(rupiah_formatter))

# Horizontal gap, in x-axis data units, between a bar's end and its label
padding = 10000000  # Adjust the padding value as needed

# Write each bar's value just past its right-hand end, vertically centred
for bar in ax.patches:
    value = bar.get_width()
    mid_y = bar.get_y() + bar.get_height() / 2
    ax.annotate(f'Rp {value:,.0f}', (value + padding, mid_y),
                ha='left', va='center', fontsize=12, color='black')

plt.title('Top 20 Lowest Serum Revenue by Province', fontsize=16)
plt.xlabel('Serum (in Rupiah)', fontsize=14)
plt.ylabel('Province', fontsize=14)

plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import FuncFormatter

# Order the rows from lowest to highest Cream value
sorted_df = df.sort_values(by='Cream', ascending=True)

# The 20 rows with the lowest Cream value
top_20_lowest_revenue = sorted_df.head(20)

plt.figure(figsize=(18, 8))

# Horizontal bars of Cream for those 20 rows, one bar per province
ax = sns.barplot(x='Cream', y='Province', data=top_20_lowest_revenue)


def rupiah_formatter(x, pos):
    """Format tick value *x* as a Rupiah amount; *pos* is required by FuncFormatter but unused."""
    return f'Rp {x:,.0f}'


ax.xaxis.set_major_formatter(FuncFormatter(rupiah_formatter))

# Horizontal gap, in x-axis data units, between a bar's end and its label
padding = 10000  # Adjust the padding value as needed

# Write each bar's value just past its right-hand end, vertically centred
for bar in ax.patches:
    value = bar.get_width()
    mid_y = bar.get_y() + bar.get_height() / 2
    ax.annotate(f'Rp {value:,.0f}', (value + padding, mid_y),
                ha='left', va='center', fontsize=12, color='black')

ax.set_title('Top 20 Lowest Cream Revenue by Province', fontsize=16)
ax.set_ylabel('Province', fontsize=14)

plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import FuncFormatter

# Order the rows from lowest to highest Toner value
sorted_df = df.sort_values(by='Toner', ascending=True)

# The 20 rows with the lowest Toner value
top_20_lowest_revenue = sorted_df.head(20)

plt.figure(figsize=(18, 8))

# Horizontal bars of Toner for those 20 rows, one bar per province
ax = sns.barplot(x='Toner', y='Province', data=top_20_lowest_revenue)


def rupiah_formatter(x, pos):
    """Format tick value *x* as a Rupiah amount; *pos* is required by FuncFormatter but unused."""
    return f'Rp {x:,.0f}'


ax.xaxis.set_major_formatter(FuncFormatter(rupiah_formatter))

# Horizontal gap, in x-axis data units, between a bar's end and its label
padding = 10000  # Adjust the padding value as needed

# Write each bar's value just past its right-hand end, vertically centred
for bar in ax.patches:
    value = bar.get_width()
    mid_y = bar.get_y() + bar.get_height() / 2
    ax.annotate(f'Rp {value:,.0f}', (value + padding, mid_y),
                ha='left', va='center', fontsize=12, color='black')

ax.set_title('Top 20 Lowest Toner Revenue by Province', fontsize=16)
ax.set_ylabel('Province', fontsize=14)

plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import FuncFormatter

# Order the rows from lowest to highest Sheet_Mask value
sorted_df = df.sort_values(by='Sheet_Mask', ascending=True)

# The 20 rows with the lowest Sheet_Mask value
top_20_lowest_revenue = sorted_df.head(20)

plt.figure(figsize=(18, 8))

# Horizontal bars of Sheet_Mask for those 20 rows, one bar per province
ax = sns.barplot(x='Sheet_Mask', y='Province', data=top_20_lowest_revenue)


def rupiah_formatter(x, pos):
    """Format tick value *x* as a Rupiah amount; *pos* is required by FuncFormatter but unused."""
    return f'Rp {x:,.0f}'


ax.xaxis.set_major_formatter(FuncFormatter(rupiah_formatter))

# Horizontal gap, in x-axis data units, between a bar's end and its label
padding = 10000  # Adjust the padding value as needed

# Write each bar's value just past its right-hand end, vertically centred
for bar in ax.patches:
    value = bar.get_width()
    mid_y = bar.get_y() + bar.get_height() / 2
    ax.annotate(f'Rp {value:,.0f}', (value + padding, mid_y),
                ha='left', va='center', fontsize=12, color='black')

ax.set_title('Top 20 Lowest Sheet Mask Revenue by Province', fontsize=16)
ax.set_ylabel('Province', fontsize=14)

plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import FuncFormatter

# Order the rows from lowest to highest Facial_Wash value
sorted_df = df.sort_values(by='Facial_Wash', ascending=True)

# The 20 rows with the lowest Facial_Wash value
top_20_lowest_revenue = sorted_df.head(20)

plt.figure(figsize=(18, 8))

# Horizontal bars of Facial_Wash for those 20 rows, one bar per province
ax = sns.barplot(x='Facial_Wash', y='Province', data=top_20_lowest_revenue)


def rupiah_formatter(x, pos):
    """Format tick value *x* as a Rupiah amount; *pos* is required by FuncFormatter but unused."""
    return f'Rp {x:,.0f}'


ax.xaxis.set_major_formatter(FuncFormatter(rupiah_formatter))

# Horizontal gap, in x-axis data units, between a bar's end and its label
padding = 10000  # Adjust the padding value as needed

# Write each bar's value just past its right-hand end, vertically centred
for bar in ax.patches:
    value = bar.get_width()
    mid_y = bar.get_y() + bar.get_height() / 2
    ax.annotate(f'Rp {value:,.0f}', (value + padding, mid_y),
                ha='left', va='center', fontsize=12, color='black')

# The chart shows Facial_Wash; the original title said "Sheet Mask"
plt.title('Top 20 Lowest Facial Wash Revenue by Province', fontsize=16)
plt.ylabel('Province', fontsize=14)

plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import FuncFormatter

# Order the rows from lowest to highest Night_Cream value
sorted_df = df.sort_values(by='Night_Cream', ascending=True)

# The 20 rows with the lowest Night_Cream value
top_20_lowest_revenue = sorted_df.head(20)

plt.figure(figsize=(18, 8))

# Horizontal bars of Night_Cream for those 20 rows, one bar per province
ax = sns.barplot(x='Night_Cream', y='Province', data=top_20_lowest_revenue)


def rupiah_formatter(x, pos):
    """Format tick value *x* as a Rupiah amount; *pos* is required by FuncFormatter but unused."""
    return f'Rp {x:,.0f}'


ax.xaxis.set_major_formatter(FuncFormatter(rupiah_formatter))

# Horizontal gap, in x-axis data units, between a bar's end and its label
padding = 10000  # Adjust the padding value as needed

# Write each bar's value just past its right-hand end, vertically centred
for bar in ax.patches:
    value = bar.get_width()
    mid_y = bar.get_y() + bar.get_height() / 2
    ax.annotate(f'Rp {value:,.0f}', (value + padding, mid_y),
                ha='left', va='center', fontsize=12, color='black')

ax.set_title('Top 20 Lowest Night Cream Revenue by Province', fontsize=16)
ax.set_ylabel('Province', fontsize=14)

plt.show()


In [None]:
# Pairwise scatter/distribution grid for total revenue and the six product columns
key_columns = [
    'Total_Revenue',
    'Serum',
    'Cream',
    'Toner',
    'Facial_Wash',
    'Sheet_Mask',
    'Night_Cream',
]
sns.pairplot(df[key_columns])
# y slightly above 1 places the figure title above the top row of panels
plt.suptitle('Pairplot of Key Variables', y=1.02)
plt.show()