In [1]:
import pandas as pd

def fix_review_count(arg: str) -> int:
    """Convert a reviewer-count label such as ``'73.8k Reviews'`` to an integer.

    Only the first whitespace-separated token is parsed. A trailing ``k``
    (upper or lower case) means thousands, and thousands separators (``,``)
    are ignored.

    Args:
        arg: Raw label, e.g. ``'512 Reviews'`` or ``'46.8k Reviews'``.

    Returns:
        The reviewer count as an ``int``.

    Raises:
        IndexError: If ``arg`` is empty or contains only whitespace.
        ValueError: If the first token is not numeric.
    """
    token = arg.split()[0].replace(',', '').lower()
    if token.endswith('k'):
        # round() rather than int(): the float product can land just below the
        # intended value (e.g. 1.001 * 1000 -> 1000.9999999999999) and int()
        # would silently truncate it by one.
        return round(1000 * float(token[:-1]))
    return int(token)

# Location of the Kaggle dataset with company ratings and review counts.
COMPANIES = '/kaggle/input/top-companies-rating-and-reviews/1000 companies list and rating.csv'

# The first CSV column is used as the row index.
df = pd.read_csv(COMPANIES, index_col=[0])

# 'age' and 'reviewers' arrive as text ("56 years old", "73.8k Reviews");
# derive numeric columns from them before any analysis.
# Missing ages are float NaN and pass through untouched; otherwise keep the
# leading integer of the string.
df['age in years'] = df['age'].apply(
    lambda age: int(age.split()[0]) if not isinstance(age, float) else age
)
df['reviewer count'] = df['reviewers'].apply(fix_review_count)
df.head()

Unnamed: 0,company,type,rating,reviewers,age,highly_ratedFOR,critically_ratedFOR,age in years,reviewer count
0,TCS,IT Services & Consulting,3.8,73.8k Reviews,56 years old,"Job Security, Work Life Balance","Promotions / Appraisal, Salary & Benefits",56.0,73800
1,Accenture,IT Services & Consulting,4.0,46.8k Reviews,35 years old,"Company Culture, Skill Development / Learning,...",,35.0,46800
2,Cognizant,IT Services & Consulting,3.9,42.1k Reviews,30 years old,Skill Development / Learning,Promotions / Appraisal,30.0,42100
3,Wipro,IT Services & Consulting,3.8,39.7k Reviews,79 years old,Job Security,"Promotions / Appraisal, Salary & Benefits",79.0,39700
4,Capgemini,IT Services & Consulting,3.9,34.3k Reviews,57 years old,"Job Security, Work Life Balance, Skill Develop...","Promotions / Appraisal, Salary & Benefits",57.0,34300


In [2]:
# Inspect dtypes and non-null counts to confirm the derived columns are numeric
# and to see which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   company              10000 non-null  object 
 1   type                 9860 non-null   object 
 2   rating               10000 non-null  float64
 3   reviewers            10000 non-null  object 
 4   age                  9185 non-null   object 
 5   highly_ratedFOR      9923 non-null   object 
 6   critically_ratedFOR  2822 non-null   object 
 7   age in years         9185 non-null   float64
 8   reviewer count       10000 non-null  int64  
dtypes: float64(2), int64(1), object(6)
memory usage: 781.2+ KB


In [3]:
from plotly import express
# Distribution of company ratings; titled so the figure stands alone when skimmed.
express.histogram(data_frame=df, x='rating', title='Distribution of company ratings')

In [4]:
# Company ages are heavily skewed, so counts are shown on a log axis to keep
# the sparse tail of very old companies visible.
express.histogram(
    data_frame=df,
    x='age in years',
    log_y=True,
    title='Distribution of company age (log-scaled counts)',
)

Wow. Imagine working for a 2000-year-old company. Are we sure these ages are denominated in years when they are not NaN?

In [5]:
# Drop the leading number from every age string and tally what remains;
# NaN (missing age) is passed through and counted as its own bucket.
(
    df['age']
    .apply(lambda age: age if isinstance(age, float) else ' '.join(age.split()[1:]))
    .value_counts(dropna=False)
)

age
years old    9185
NaN           815
Name: count, dtype: int64

Yes. They're years.

Do we expect company ratings to be correlated with company age or the number of reviewers?

In [6]:
# Pairwise correlation of the three numeric columns.
# The colour range is pinned to [-1, 1] on a diverging scale and the
# coefficients are printed in each cell, so weak correlations are not
# visually exaggerated by auto-scaling and can be read off directly.
express.imshow(
    img=df[['rating', 'age in years', 'reviewer count']].corr(),
    zmin=-1,
    zmax=1,
    color_continuous_scale='RdBu',
    text_auto='.2f',
    title='Correlation between rating, company age and reviewer count',
)

They are essentially uncorrelated.

In [7]:
# Age against reviewer count on log-log axes, coloured by rating; hovering a
# point shows the company name.
express.scatter(
    data_frame=df,
    x='age in years',
    y='reviewer count',
    color='rating',
    hover_name='company',
    log_x=True,
    log_y=True,
    title='Reviewer count vs. company age, coloured by rating',
)

Yes, that's what uncorrelated data looks like.