In [84]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# Load your dataset
data = pd.read_csv('hydcolleges1.csv')
data


Unnamed: 0,colleges,course fee,placement,review,ranking,number of students placed out of 100
0,jntuh,119000.0,4600000.0,3.8,6.0,99.0
1,vardhaman,143000.0,624000.0,4.0,51.0,86.0
2,ibs,1600000.0,1000000.0,4.1,21.0,77.0
3,vnrvjit,860000.0,800000.0,4.2,25.0,90.0
4,woxsen,770000.0,900000.0,4.2,12.0,56.0
5,bvrit,138000.0,2600000.0,4.2,179.0,79.0
6,iit,53000.0,2000000.0,4.3,681.0,89.0
7,isb,3629260.0,3421000.0,4.6,31.0,85.0
8,,,,,,


In [85]:
# Remove row with index 2
df_cleaned = data.drop(index=8)


In [86]:
# Reset index
df_cleaned.reset_index(drop=True, inplace=True)


In [87]:
print(df_cleaned)


    colleges  course fee  placement  review  ranking  \
0      jntuh    119000.0  4600000.0     3.8      6.0   
1  vardhaman    143000.0   624000.0     4.0     51.0   
2        ibs   1600000.0  1000000.0     4.1     21.0   
3    vnrvjit    860000.0   800000.0     4.2     25.0   
4     woxsen    770000.0   900000.0     4.2     12.0   
5      bvrit    138000.0  2600000.0     4.2    179.0   
6        iit     53000.0  2000000.0     4.3    681.0   
7        isb   3629260.0  3421000.0     4.6     31.0   

   number of students placed out of 100  
0                                  99.0  
1                                  86.0  
2                                  77.0  
3                                  90.0  
4                                  56.0  
5                                  79.0  
6                                  89.0  
7                                  85.0  


In [88]:
# Separate features (X) and labels (y)
X = df_cleaned[['review','ranking']]  # Include all relevant features
y = df_cleaned['colleges']  # Assuming 'college_name' is the column with college names or IDs

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [89]:
# Initialize the model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)


In [90]:
# Make predictions
y_pred = model.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


Accuracy: 0.0


In [91]:
# Example prediction based on new data
new_data = pd.DataFrame([[8.5, 9.0]], columns=['review','ranking'])
predicted_college = model.predict(new_data)

print(f"The predicted best college is: {predicted_college[0]}")


The predicted best college is: isb


In [92]:
# Example input for prediction
new_data = pd.DataFrame([[8.5, 9]], columns=['review', 'ranking'])

# Predict the placement offer
predicted_placement = model.predict(new_data)

# Find the college with the highest predicted placement offer (assuming higher is better)
best_college_index = predicted_placement.argmax()
best_college = data.loc[best_college_index, 'colleges']

print(f"The predicted best college based on placement offer is: {best_college}")


The predicted best college based on placement offer is: jntuh


In [93]:
# Example input for prediction (you can adjust these values accordingly)
new_data = pd.DataFrame([[4.5, 50]], columns=['review', 'ranking'])

# Predict the placement offer
predicted_placement = model.predict(new_data)

# Find the college with the highest predicted placement offer
best_college_index = predicted_placement.argmax()
best_college = data.loc[best_college_index, 'colleges']

print(f"The predicted best college based on placement offer is: {best_college}")


The predicted best college based on placement offer is: jntuh


In [94]:
# Initialize the classifier model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)


In [95]:
# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


Accuracy: 0.0


In [96]:
new_data = pd.DataFrame([[4.5, 60]], columns=['review', 'ranking']) # Correct order of columns

# Predict the review category
predicted_category = model.predict(new_data)

if predicted_category[0] == 'best':
    best_college_index = data['review'].idxmax()
else:
    best_college_index = data['review'].idxmin()

best_college = data.loc[best_college_index, 'colleges']

print(f"The predicted best college based on review is: {best_college}")

The predicted best college based on review is: jntuh


In [97]:
# Find the college with the highest number of students placed
best_college_index = df_cleaned['number of students placed out of 100'].idxmax()
best_college = df_cleaned.loc[best_college_index, 'colleges']

print(f"The best college based on the number of students placed is: {best_college}")


The best college based on the number of students placed is: jntuh


In [98]:
df=df_cleaned
# Initialize the scaler
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Normalize the selected columns
df[['placement', 'number of students placed out of 100', 'review']] = scaler.fit_transform(df[['placement', 'number of students placed out of 100', 'review']])


In [99]:
# Define weights for each criterion
weights = {
    'placement': 0.2,
    'number of students placed out of 100': 0.7,
    'review': 0.1
}

# Calculate the combined score
df['combined_score'] = (df['placement'] * weights['placement'] +
                        df['number of students placed out of 100'] * weights['number of students placed out of 100'] +
                        df['review'] * weights['review'])


In [100]:
# Find the college with the highest combined score
best_college_index = df['combined_score'].idxmax()
best_college = df.loc[best_college_index, 'colleges']

print(f"The best college based on combined criteria is: {best_college}")


The best college based on combined criteria is: jntuh


In [101]:
def calculate_ranking(feature_values, ascending=True):
    # Calculate ranks based on ascending or descending order
    ranks = feature_values.rank(ascending=ascending, method='dense')
    return ranks

# Calculate ranks for each feature
df['course_fee_rank'] = calculate_ranking(df['course fee'])
df['placement_rank'] = calculate_ranking(df['placement'], ascending=False)
df['review_rank'] = calculate_ranking(df['review'], ascending=False)
df['ranking_rank'] = calculate_ranking(df['ranking'])
df['no_of_students_placed_rank'] = calculate_ranking(df['number of students placed out of 100'], ascending=False)


In [102]:
# Define a function to determine best and worst features
def best_worst_features(row):
    features = ['course_fee', 'placement', 'review', 'ranking', 'no_of_students_placed']
    best_feature = min(features, key=lambda f: row[f + '_rank'])
    worst_feature = max(features, key=lambda f: row[f + '_rank'])
    return best_feature, worst_feature

# Apply the function row-wise
df[['best_feature', 'worst_feature']] = df.apply(best_worst_features, axis=1, result_type='expand')

# Display the DataFrame with best and worst features
print(df[['colleges', 'best_feature', 'worst_feature']])


    colleges           best_feature          worst_feature
0      jntuh              placement                 review
1  vardhaman             course_fee              placement
2        ibs                ranking             course_fee
3    vnrvjit  no_of_students_placed              placement
4     woxsen                ranking  no_of_students_placed
5      bvrit             course_fee                ranking
6        iit             course_fee                ranking
7        isb                 review             course_fee


In [103]:
df1=df_cleaned

In [104]:
df1

Unnamed: 0,colleges,course fee,placement,review,ranking,number of students placed out of 100,combined_score,course_fee_rank,placement_rank,review_rank,ranking_rank,no_of_students_placed_rank,best_feature,worst_feature
0,jntuh,119000.0,1.0,0.0,6.0,1.0,0.9,2.0,1.0,6.0,1.0,1.0,placement,review
1,vardhaman,143000.0,0.0,0.25,51.0,0.697674,0.513372,4.0,8.0,5.0,6.0,4.0,course_fee,placement
2,ibs,1600000.0,0.094567,0.375,21.0,0.488372,0.398274,7.0,5.0,4.0,3.0,7.0,ranking,course_fee
3,vnrvjit,860000.0,0.044266,0.5,25.0,0.790698,0.612341,6.0,7.0,3.0,4.0,2.0,no_of_students_placed,placement
4,woxsen,770000.0,0.069416,0.5,12.0,0.0,0.063883,5.0,6.0,3.0,2.0,8.0,ranking,no_of_students_placed
5,bvrit,138000.0,0.496982,0.5,179.0,0.534884,0.523815,3.0,3.0,3.0,7.0,6.0,course_fee,ranking
6,iit,53000.0,0.346076,0.625,681.0,0.767442,0.668925,1.0,4.0,2.0,8.0,3.0,course_fee,ranking
7,isb,3629260.0,0.703471,1.0,31.0,0.674419,0.712787,8.0,2.0,1.0,5.0,5.0,review,course_fee


In [105]:
data

Unnamed: 0,colleges,course fee,placement,review,ranking,number of students placed out of 100
0,jntuh,119000.0,4600000.0,3.8,6.0,99.0
1,vardhaman,143000.0,624000.0,4.0,51.0,86.0
2,ibs,1600000.0,1000000.0,4.1,21.0,77.0
3,vnrvjit,860000.0,800000.0,4.2,25.0,90.0
4,woxsen,770000.0,900000.0,4.2,12.0,56.0
5,bvrit,138000.0,2600000.0,4.2,179.0,79.0
6,iit,53000.0,2000000.0,4.3,681.0,89.0
7,isb,3629260.0,3421000.0,4.6,31.0,85.0
8,,,,,,


In [106]:
df_cleaned

Unnamed: 0,colleges,course fee,placement,review,ranking,number of students placed out of 100,combined_score,course_fee_rank,placement_rank,review_rank,ranking_rank,no_of_students_placed_rank,best_feature,worst_feature
0,jntuh,119000.0,1.0,0.0,6.0,1.0,0.9,2.0,1.0,6.0,1.0,1.0,placement,review
1,vardhaman,143000.0,0.0,0.25,51.0,0.697674,0.513372,4.0,8.0,5.0,6.0,4.0,course_fee,placement
2,ibs,1600000.0,0.094567,0.375,21.0,0.488372,0.398274,7.0,5.0,4.0,3.0,7.0,ranking,course_fee
3,vnrvjit,860000.0,0.044266,0.5,25.0,0.790698,0.612341,6.0,7.0,3.0,4.0,2.0,no_of_students_placed,placement
4,woxsen,770000.0,0.069416,0.5,12.0,0.0,0.063883,5.0,6.0,3.0,2.0,8.0,ranking,no_of_students_placed
5,bvrit,138000.0,0.496982,0.5,179.0,0.534884,0.523815,3.0,3.0,3.0,7.0,6.0,course_fee,ranking
6,iit,53000.0,0.346076,0.625,681.0,0.767442,0.668925,1.0,4.0,2.0,8.0,3.0,course_fee,ranking
7,isb,3629260.0,0.703471,1.0,31.0,0.674419,0.712787,8.0,2.0,1.0,5.0,5.0,review,course_fee


In [107]:
import pandas as pd

# Example dataset with various attributes
data = {
    'colleges': ['vardhaman', 'isb', 'iit', 'woxsen', 'ibs', 'vnrvjit', 'jntuh', 'bvrit'],
    'course_fee': [143000, 3629260, 53000, 770000, 1600000, 860000, 119000, 138000],
    'placement': [624000, 3421000, 2000000, 900000, 1000000, 800000, 4600000, 2650000],
    'review': [4.0, 4.6, 4.3, 4.2, 4.1, 4.2, 3.8, 4.2],
    'ranking': [51, 31, 681, 12, 21, 25, 6, 179],
    'no_of_students_placed': [85, 92, 78, 88, 90, 84, 91, 87]  # Placement percentage out of 100
}

df = pd.DataFrame(data)

# Define weights for each criterion (you can adjust these weights based on priority)
weights = {
    'course_fee': 0.2,
    'placement': 0.3,
    'review': 0.3,
    'ranking': 0.2,
    'no_of_students_placed': 0.4
}

# Normalize the data for weighted sum calculation
df_normalized = df.copy()
for column in ['course_fee', 'placement', 'review', 'ranking', 'no_of_students_placed']:
    df_normalized[column] = (df[column] - df[column].min()) / (df[column].max() - df[column].min())

# Calculate the weighted sum
df_normalized['weighted_sum'] = (
    weights['course_fee'] * df_normalized['course_fee'] +
    weights['placement'] * df_normalized['placement'] +
    weights['review'] * df_normalized['review'] +
    weights['ranking'] * df_normalized['ranking'] +
    weights['no_of_students_placed'] * df_normalized['no_of_students_placed']
)

# Find the college with the highest weighted sum
best_college_index = df_normalized['weighted_sum'].idxmax()
best_college = df.loc[best_college_index, 'colleges']

print(f"The best college based on the weighted sum model is: {best_college}")


The best college based on the weighted sum model is: isb


In [108]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Example dataset with various attributes and labels
data = {
    'colleges': ['vardhaman', 'isb', 'iit', 'woxsen', 'ibs', 'vnrvjit', 'jntuh', 'bvrit'],
    'course_fee': [143000, 3629260, 53000, 770000, 1600000, 860000, 119000, 138000],
    'placement': [624000, 3421000, 2000000, 900000, 1000000, 800000, 4600000, 2650000],
    'review': [4.0, 4.6, 4.3, 4.2, 4.1, 4.2, 3.8, 4.2],
    'ranking': [51, 31, 681, 12, 21, 25, 6, 179],
    'no_of_students_placed': [85, 92, 78, 88, 90, 84, 91, 87],  # Placement percentage out of 100
    'label': ['good', 'best', 'average', 'good', 'good', 'average', 'best', 'average']
}

df = pd.DataFrame(data)

# Encode labels to numerical values
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])

# Prepare features and target
X = df[['course_fee', 'placement', 'review', 'ranking', 'no_of_students_placed']]
y = df['label_encoded']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Determine the best college based on predictions
best_college_index = y_pred.argmax()
best_college = df.loc[best_college_index, 'colleges']

print(f"The best college predicted by the classifier is: {best_college}")


The best college predicted by the classifier is: vardhaman


In [109]:
from sklearn.decomposition import PCA

# Example dataset with various attributes
data = {
    'colleges': ['vardhaman', 'isb', 'iit', 'woxsen', 'ibs', 'vnrvjit', 'jntuh', 'bvrit'],
    'course_fee': [143000, 3629260, 53000, 770000, 1600000, 860000, 119000, 138000],
    'placement': [624000, 3421000, 2000000, 900000, 1000000, 800000, 4600000, 2650000],
    'review': [4.0, 4.6, 4.3, 4.2, 4.1, 4.2, 3.8, 4.2],
    'ranking': [51, 31, 681, 12, 21, 25, 6, 179],
    'no_of_students_placed': [85, 92, 78, 88, 90, 84, 91, 87]  # Placement percentage out of 100
}

df = pd.DataFrame(data)

# Prepare features for PCA
X = df[['course_fee', 'placement', 'review', 'ranking', 'no_of_students_placed']]

# Apply PCA for dimensionality reduction
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Use X_pca as input for any of the methods above (e.g., weighted sum model, classifier)

# Example: Calculate weighted sum on reduced dataset
df_pca = pd.DataFrame(X_pca, columns=['PCA_component_1', 'PCA_component_2'])
df_pca['weighted_sum'] = (
    weights['course_fee'] * df['course_fee'] +
    weights['placement'] * df['placement'] +
    weights['review'] * df['review'] +
    weights['ranking'] * df['ranking'] +
    weights['no_of_students_placed'] * df['no_of_students_placed']
)

# Find the college with the highest weighted sum
best_college_index = df_pca['weighted_sum'].idxmax()
best_college = df.loc[best_college_index, 'colleges']

print(f"The best college based on PCA and weighted sum model is: {best_college}")


The best college based on PCA and weighted sum model is: isb


In [110]:
import pandas as pd
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.preprocessing import StandardScaler

# Example dataset with various attributes
data = {
    'colleges': ['vardhaman', 'isb', 'iit', 'woxsen', 'ibs', 'vnrvjit', 'jntuh', 'bvrit'],
    'course_fee': [143000, 3629260, 53000, 770000, 1600000, 860000, 119000, 138000],
    'placement': [624000, 3421000, 2000000, 900000, 1000000, 800000, 4600000, 2650000],
    'review': [4.0, 4.6, 4.3, 4.2, 4.1, 4.2, 3.8, 4.2],
    'ranking': [51, 31, 681, 12, 21, 25, 6, 179],
    'no_of_students_placed': [85, 92, 78, 88, 90, 84, 91, 87]  # Placement percentage out of 100
}

df = pd.DataFrame(data)

# Select features for clustering
X = df[['course_fee', 'placement', 'review', 'ranking', 'no_of_students_placed']]

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply hierarchical clustering
linkage_matrix = linkage(X_scaled, method='ward', metric='euclidean')
df['cluster'] = fcluster(linkage_matrix, 3, criterion='maxclust')  # Adjust number of clusters as needed

# Calculate the mean for numeric columns only
cluster_means = df.groupby('cluster').mean(numeric_only=True)

# Find the cluster with the highest average score (e.g., placement, review, etc.)
best_cluster = cluster_means['placement'].idxmax()  # Example: Choose based on placement average

# Find the best college in the best cluster
best_college_index = df[df['cluster'] == best_cluster]['placement'].idxmax()
best_college = df.loc[best_college_index, 'colleges']

print(f"The best college based on hierarchical clustering is: {best_college}")

The best college based on hierarchical clustering is: isb


In [111]:
import pandas as pd
from textblob import TextBlob

# Example dataset with colleges and reviews
data = {
    'colleges': ['vardhaman', 'isb', 'iit', 'woxsen', 'ibs', 'vnrvjit', 'jntuh', 'bvrit'],
    'review_text': [
        'Excellent college with great placements and faculty.',
        'Top business school with excellent placements and infrastructure.',
        'Premier institute known for its rigorous curriculum and research.',
        'Good college with decent placements and supportive faculty.',
        'Well-known for its management program and good campus life.',
        'One of the top engineering colleges with good industry ties.',
        'Known for its engineering programs and good campus facilities.',
        'Offers a wide range of programs with decent placement support.'
    ]
}

df = pd.DataFrame(data)

df['sentiment'] = df['review_text'].apply(lambda x: TextBlob(x).sentiment.polarity)
best_college_index = df['sentiment'].idxmax()
best_college = df.loc[best_college_index, 'colleges']
print(f"The best college based on sentiment analysis is: {best_college}")


The best college based on sentiment analysis is: vardhaman


In [112]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Example dataset with various attributes
data = {
    'colleges': ['vardhaman', 'isb', 'iit', 'woxsen', 'ibs', 'vnrvjit', 'jntuh', 'bvrit'],
    'course_fee': [143000, 3629260, 53000, 770000, 1600000, 860000, 119000, 138000],
    'placement': [624000, 3421000, 2000000, 900000, 1000000, 800000, 4600000, 2650000],
    'review': [4.0, 4.6, 4.3, 4.2, 4.1, 4.2, 3.8, 4.2],
    'ranking': [51, 31, 681, 12, 21, 25, 6, 179],
    'no_of_students_placed': [85, 92, 78, 88, 90, 84, 91, 87]  # Placement percentage out of 100
}

df = pd.DataFrame(data)

# Select features for clustering
X = df[['course_fee', 'placement', 'review', 'ranking', 'no_of_students_placed']]

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply K-means clustering
kmeans = KMeans(n_clusters=3, random_state=42)  # Adjust number of clusters as needed
df['cluster'] = kmeans.fit_predict(X_scaled)

# Calculate mean for numerical columns only
cluster_means = df.groupby('cluster')[['course_fee', 'placement', 'review', 'ranking', 'no_of_students_placed']].mean()

# Find the cluster with the highest average score (e.g., placement, review, etc.)
best_cluster = cluster_means['placement'].idxmax()  # Example: Choose based on placement average

# Find the best college in the best cluster
best_college_index = df[df['cluster'] == best_cluster]['placement'].idxmax()
best_college = df.loc[best_college_index, 'colleges']

print(f"The best college based on K-means clustering is: {best_college}")

The best college based on K-means clustering is: isb




In [113]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score

# Example dataset with various attributes
data = {
    'colleges': ['vardhaman', 'isb', 'iit', 'woxsen', 'ibs', 'vnrvjit', 'jntuh', 'bvrit'],
    'course_fee': [143000, 3629260, 53000, 770000, 1600000, 860000, 119000, 138000],
    'placement': [624000, 3421000, 2000000, 900000, 1000000, 800000, 4600000, 2650000],
    'review': [4.0, 4.6, 4.3, 4.2, 4.1, 4.2, 3.8, 4.2],
    'ranking': [51, 31, 681, 12, 21, 25, 6, 179],
    'no_of_students_placed': [85, 92, 78, 88, 90, 84, 91, 87]  # Placement percentage out of 100
}

df = pd.DataFrame(data)

# Define features and target
# Choose a numerical target variable instead of 'colleges'
X = df[['course_fee', 'review', 'ranking', 'no_of_students_placed']]
y = df['placement']  # Example: Predicting placement, use a suitable numerical column

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize models
# Consider removing or replacing classification models if your target is numerical
models = {

    'LinearRegression': LinearRegression(),
    'LogisticRegression': LogisticRegression(),
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'RandomForestClassifier': RandomForestClassifier(random_state=42)
}

# Train and evaluate each model
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    if 'Regressor' in name:
        y_pred = model.predict(X_test)
        # Example evaluation metric for regression
        score = mean_squared_error(y_test, y_pred)
        results[name] = score

# Print results
for name, score in results.items():
    print(f"{name}: {score}")

# Determine the best model based on the lowest MSE
best_model = min(results, key=results.get)
print(f"Best Model: {best_model}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


RandomForestRegressor: 1689271533800.0
Best Model: RandomForestRegressor
