# Меры разброса

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px


usa = pd.read_csv('UrbanizationCensusTract.csv')
usa.head()

Unnamed: 0.1,Unnamed: 0,statefips,state,gisjoin,lat_tract,long_tract,population,adj_radiuspop_5,urbanindex,log_pop
0,0,1,Alabama,G0100010020100,32.47718,-86.49007,1845,44076.0,10.69367,7.520235
1,1,1,Alabama,G0100010020200,32.47425,-86.47337,2172,43008.0,10.66914,7.683404
2,2,1,Alabama,G0100010020300,32.47543,-86.46019,3385,38987.0,10.57098,8.127109
3,3,1,Alabama,G0100010020400,32.472,-86.44363,4267,43131.0,10.672,8.358666
4,4,1,Alabama,G0100010020500,32.45883,-86.42266,9965,43131.0,10.672,9.206834


In [8]:
trace = go.Histogram(x=usa['urbanindex'])
fig = go.Figure(data=trace)
fig

In [9]:
trace = go.Box(x=usa['population'])
fig = go.Figure(data=trace)
fig

In [10]:
alaska = usa[usa['state'] == 'Alaska']
trace = go.Histogram(x=alaska['population'])
fig = go.Figure(data=trace)
fig

In [11]:
trace = go.Bar(x=alaska['gisjoin'], y=alaska['population'])

fig = go.Figure(data=[trace])

fig.update_layout(
    title="Распределение населения по районам Аляски",
    xaxis_title="Номер переписного района (gisjoin)",
    yaxis_title="Число жителей"
)

fig

In [13]:
grouped_data = usa.groupby('state')

fig = go.Figure()

for state, group in grouped_data:
    fig.add_trace(go.Box(
        y=group['urbanindex'],
        name=state,
        boxmean=True
    ))

fig.update_layout(
    title="Ящики с усами по переменной urbanindex для каждого штата",
    yaxis_title="Urban Index",
    xaxis_title="Штат",
    showlegend=False
)
fig

In [19]:
states_with_no_outliers = []
grouped_data = usa.groupby('state')

for state, group in grouped_data:
    Q1 = group['urbanindex'].quantile(0.25)
    Q3 = group['urbanindex'].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    if not group['urbanindex'].between(lower_bound, upper_bound).any():
        states_with_no_outliers.append(state)

print("Штаты без выбросов по переменной urbanindex:")
for state in states_with_no_outliers:
    print(state)

Штаты без выбросов по переменной urbanindex:


In [24]:
def has_outliers(data):
    Q1 = data['urbanindex'].quantile(0.25)
    Q3 = data['urbanindex'].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return not data['urbanindex'].between(lower_bound, upper_bound).all()

grouped_data = usa.groupby('state')

states_with_no_outliers = []

for state, group in grouped_data:
    if not has_outliers(group):
        states_with_no_outliers.append(state)
answers = ['California', 'Nebraska','Arizona', 'Texas']

print("Штаты без выбросов по переменной urbanindex:")
for state in states_with_no_outliers:
    if state in answers:
      print(state)

Штаты без выбросов по переменной urbanindex:
Nebraska


In [28]:
correlation_matrix = usa[['population', 'adj_radiuspop_5', 'urbanindex']].corr()

fig = go.Figure(data=go.Heatmap(
    z=correlation_matrix.values,
    x=correlation_matrix.columns,
    y=correlation_matrix.columns,
    colorscale='Viridis',
    colorbar_title='Корреляция'
))

fig.update_layout(
    title='Корреляция между переменными',
    xaxis_title='Переменные',
    yaxis_title='Переменные'
)

fig.show()

In [33]:
def has_outliers(data):
    Q1 = data['population'].quantile(0.25)
    Q3 = data['population'].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return not data['population'].between(lower_bound, upper_bound).all()

grouped_data = usa.groupby('state')

answers = ['Alaska', 'Oklahoma', 'North Dakota', 'Montana']

states_with_no_outliers = []

for state in answers:
    if state in grouped_data.groups:
        group = grouped_data.get_group(state)
        if not has_outliers(group):
            states_with_no_outliers.append(state)

print("Штаты без выбросов из указанных вариантов:")
print(states_with_no_outliers)

print("Количество штатов без выбросов из указанных вариантов:", len(states_with_no_outliers))


Штаты без выбросов из указанных вариантов:
[]
Количество штатов без выбросов из указанных вариантов: 0


In [34]:
alaska = usa[usa['state'] == 'Oklahoma']
trace = go.Histogram(x=alaska['urbanindex'])
fig = go.Figure(data=trace)
fig

In [35]:
def count_outliers(data):
    Q1 = data['urbanindex'].quantile(0.25)
    Q3 = data['urbanindex'].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return len(data['urbanindex'].between(lower_bound, upper_bound))

grouped_data = usa.groupby('state')

answers = ['Kansas', 'Kentucky']

states_with_no_outliers = []

for state in answers:
      group = grouped_data.get_group(state)
      print(count_outliers(group))

759
1109


In [36]:
grouped = usa.groupby('State')
correlation_matrix = usa[['population', 'adj_radiuspop_5']].corr()

fig = go.Figure(data=go.Heatmap(
    z=correlation_matrix.values,
    x=correlation_matrix.columns,
    y=correlation_matrix.columns,
    colorscale='Viridis',
    colorbar_title='Корреляция'
))

fig.update_layout(
    title='Корреляция между переменными',
    xaxis_title='Переменные',
    yaxis_title='Переменные'
)

fig.show()

In [40]:
grouped_data = usa.groupby('statefips')

correlation_by_state = {}

for statefip, group in grouped_data:
    correlation = group[['population', 'adj_radiuspop_5']].corr().iloc[0, 1]
    correlation_by_state[statefip] = correlation

max_corr_statefip = max(correlation_by_state, key=lambda k: abs(correlation_by_state[k]))
max_corr_value = correlation_by_state[max_corr_statefip]

print(f"Штат с самой высокой корреляцией: {max_corr_statefip} (Корреляция: {max_corr_value})")

Штат с самой высокой корреляцией: 22 (Корреляция: -0.38216234663845183)
